Skip to content

Commit 54bd845

Browse files
committed
fix: Correct sub-language directory issues and test expectations
1 parent 117d534 commit 54bd845

File tree

119 files changed

+441
-1253
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

119 files changed

+441
-1253
lines changed

src/scribe_data/check/check_missing_forms/check_missing_forms.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -390,18 +390,15 @@ def process_missing_features(missing_features, query_dir):
390390
"""
391391
if not missing_features:
392392
return
393-
sub_languages_iso_codes = {}
394-
for language, sub_langs in sub_languages.items():
395-
# Get all unique QIDs and their ISO codes for this language.
396-
qid_to_isos = {}
397-
for iso_code, sub_lang_data in sub_langs.items():
398-
qid = sub_lang_data["qid"]
399-
if qid not in qid_to_isos:
400-
qid_to_isos[qid] = set()
401-
qid_to_isos[qid].add(iso_code)
402-
403-
# Add to main dictionary.
404-
sub_languages_iso_codes |= qid_to_isos
393+
394+
# Map parent language QID to list of sub-language ISO codes
395+
# For example: Q11051 (Hindustani) -> ["hi", "ur"]
396+
parent_qid_to_iso_codes = {}
397+
for parent_lang_name, sub_langs in sub_languages.items():
398+
# Get the parent language's QID from metadata
399+
parent_qid = language_metadata.get(parent_lang_name, {}).get("qid")
400+
if parent_qid:
401+
parent_qid_to_iso_codes[parent_qid] = list(sub_langs.keys())
405402

406403
for language_qid, data_types_qid in missing_features.items():
407404
try:
@@ -421,10 +418,10 @@ def process_missing_features(missing_features, query_dir):
421418
continue
422419

423420
language_entry = {language_qid: {data_type_qid: features}}
424-
if language_qid in sub_languages_iso_codes:
421+
if language_qid in parent_qid_to_iso_codes:
425422
# For macro-languages, generate a separate set of files
426423
# for each sub-language, each with a specific filter.
427-
for sub_lang_iso_code in sub_languages_iso_codes[language_qid]:
424+
for sub_lang_iso_code in parent_qid_to_iso_codes[language_qid]:
428425
print(
429426
f"Generating query for {language_qid} - {data_type_qid} - {sub_lang_iso_code}"
430427
)

src/scribe_data/check/check_missing_forms/generate_query.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -114,19 +114,28 @@ def generate_query(missing_features, query_dir=None, sub_lang_iso_code=None):
114114

115115
# Find the language entry by QID.
116116
language_entry = None
117+
parent_language = None
118+
sub_language_name = None
119+
117120
for name, data in language_metadata.items():
118121
if data.get("qid") == language_qid:
119122
language_entry = (name, data)
123+
# If this language has sub-languages and we have a sub_lang_iso_code,
124+
# look up the sub-language name
125+
if sub_lang_iso_code and "sub_languages" in data:
126+
# Look up the sub-language name by ISO code
127+
if sub_lang_iso_code in sub_languages.get(name, {}):
128+
parent_language = name
129+
sub_language_name = sub_languages[name][sub_lang_iso_code]["name"]
120130
break
121131
# Check sub-languages if main language not found
122132
if "sub_languages" in data:
123133
for sub_name, sub_data in data["sub_languages"].items():
124134
if sub_data.get("qid") == language_qid:
125-
# Use main language name instead of sub_name.
126-
language_entry = (
127-
name,
128-
sub_data,
129-
)
135+
# Keep track of parent and sub-language names
136+
language_entry = (name, sub_data)
137+
parent_language = name
138+
sub_language_name = sub_name
130139
break
131140
if language_entry is None:
132141
raise ValueError(f"Language with QID {language_qid} not found in metadata")
@@ -219,13 +228,21 @@ def generate_query(missing_features, query_dir=None, sub_lang_iso_code=None):
219228
final_query = main_body + where_clause + optional_clauses + "}\n"
220229

221230
# Create base filename.
222-
if sub_lang_iso_code:
223-
base_file_name = f"{query_dir}/{language}/{sub_lang_name}/{data_type}/query_{data_type}.sparql"
231+
# If this is a sub-language, place it under parent_language/sub_language/data_type/
232+
# Otherwise, place it directly under language/data_type/
233+
if sub_lang_iso_code and parent_language and sub_language_name:
234+
# Sub-language: parent/sub/datatype structure
235+
if query_dir:
236+
base_file_name = Path(query_dir) / parent_language / sub_language_name / data_type / f"query_{data_type}.sparql"
237+
else:
238+
base_file_name = Path(language_data_extraction) / parent_language / sub_language_name / data_type / f"query_{data_type}.sparql"
224239
elif query_dir:
240+
# Regular language with query_dir specified
225241
base_file_name = (
226242
Path(query_dir) / language / data_type / f"query_{data_type}.sparql"
227243
)
228244
else:
245+
# Regular language with default directory
229246
base_file_name = f"{language_data_extraction}/{language}/{data_type}/query_{data_type}.sparql"
230247

231248
# Get the next available filename.

src/scribe_data/wikidata/language_data_extraction/bokmål/articles/query_articles.sparql

Lines changed: 0 additions & 14 deletions
This file was deleted.

src/scribe_data/wikidata/language_data_extraction/bokmål/conjunctions/query_conjunctions.sparql

Lines changed: 0 additions & 14 deletions
This file was deleted.

src/scribe_data/wikidata/language_data_extraction/bokmål/postpositions/query_postpositions.sparql

Lines changed: 0 additions & 14 deletions
This file was deleted.

src/scribe_data/wikidata/language_data_extraction/bokmål/prepositions/query_prepositions.sparql

Lines changed: 0 additions & 14 deletions
This file was deleted.

src/scribe_data/wikidata/language_data_extraction/bokmål/pronouns/query_pronouns.sparql

Lines changed: 0 additions & 14 deletions
This file was deleted.

src/scribe_data/wikidata/language_data_extraction/bokmål/proper_nouns/query_proper_nouns.sparql

Lines changed: 0 additions & 14 deletions
This file was deleted.

src/scribe_data/wikidata/language_data_extraction/bokmål/translations/query_translations.sparql

Lines changed: 0 additions & 14 deletions
This file was deleted.

src/scribe_data/wikidata/language_data_extraction/chinese/adjectives/query_adjectives.sparql renamed to src/scribe_data/wikidata/language_data_extraction/chinese/mandarin/adjectives/query_adjectives.sparql

File renamed without changes.

0 commit comments

Comments
 (0)