Skip to content

Commit 117d534

Browse files
committed
feat: migrated from dump to sparql approach with optimised thresholds and rebased from upstream
1 parent 5d4ec20 commit 117d534

File tree

1,594 files changed

+17233
-52344
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,594 files changed

+17233
-52344
lines changed

query_check_sparql_service_features.json

Lines changed: 5139 additions & 0 deletions
Large diffs are not rendered by default.

src/scribe_data/check/check_missing_forms/check_missing_forms.py

Lines changed: 286 additions & 342 deletions
Large diffs are not rendered by default.

src/scribe_data/check/check_missing_forms/get_forms.py

Lines changed: 2 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# SPDX-License-Identifier: GPL-3.0-or-later
22
"""
3-
Get forms from Wikidata.
3+
Get forms from Wikidata SPARQL query files.
44
"""
55

66
import re
@@ -12,7 +12,6 @@
1212
from scribe_data.utils import (
1313
language_metadata,
1414
)
15-
from scribe_data.wikidata.parse_dump import LexemeProcessor
1615

1716
iso_to_qid = {
1817
lang_data["iso"]: lang_data["qid"]
@@ -103,33 +102,4 @@ def parse_sparql_query(query_text):
103102
return result
104103

105104

106-
def extract_dump_forms(
107-
languages=None, data_types=None, file_path="latest-lexemes.json.bz2"
108-
):
109-
"""
110-
Extract unique grammatical features from Wikidata lexeme dump.
111-
112-
Parameters
113-
----------
114-
languages : list of str, optional
115-
List of language ISO codes (e.g., ['en', 'fr']).
116-
117-
data_types : list of str, optional
118-
List of lexical categories (e.g., ['nouns', 'verbs']).
119-
120-
file_path : str, optional
121-
Path to the lexeme dump file, by default "latest-lexemes.json.bz2".
122-
123-
Returns
124-
-------
125-
dict
126-
Dictionary of unique grammatical features per language and lexical category.
127-
Format: {language_qid: {data_type_qid: features}}.
128-
"""
129-
processor = LexemeProcessor(
130-
target_lang=languages, parse_type=["form"], data_types=data_types
131-
)
132-
133-
processor.process_file(file_path)
134-
135-
return dict(processor.unique_forms)
105+
# NOTE: extract_dump_forms() was removed because the dump-based approach is no longer used.
Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,25 @@
11
# tool: scribe-data
2-
# Universal query to find all grammatical feature combinations for a given language and data type.
2+
# Universal query to find grammatical feature combinations for a given language and data type.
33
# This query identifies form combinations that exist in Wikidata for the specified language/data type.
4-
# Placeholders: {LANGUAGE_QID} and {DATA_TYPE_QID} should be replaced with actual QIDs.
4+
# Placeholders: {LANGUAGE_QID}, {DATA_TYPE_QID}, {MIN_FREQUENCY}, and {MAX_RESULTS} should be replaced with actual values.
5+
# Format matches Wikidata query service standard (w.wiki/FesH).
56

6-
SELECT ?features (COUNT(*) AS ?frequency) WHERE {{
7-
?lexeme dct:language wd:{LANGUAGE_QID} ;
8-
wikibase:lexicalCategory wd:{DATA_TYPE_QID} .
9-
10-
?lexeme ontolex:lexicalForm ?form .
11-
12-
# Collect all grammatical features for this form
13-
?form wikibase:grammaticalFeature ?feature .
14-
15-
# Group features by lexical form to get combinations
7+
SELECT ?comboQIDs (COUNT(?form) AS ?formsWithThisCombo)
8+
WHERE {{
169
{{
17-
SELECT ?form (GROUP_CONCAT(DISTINCT ?feature; separator="|") AS ?features) WHERE {{
18-
?lexeme dct:language wd:{LANGUAGE_QID} ;
19-
wikibase:lexicalCategory wd:{DATA_TYPE_QID} .
20-
21-
?lexeme ontolex:lexicalForm ?form .
22-
?form wikibase:grammaticalFeature ?feature .
10+
SELECT ?form
11+
(GROUP_CONCAT(DISTINCT REPLACE(STR(?feature), "^.*/(Q\\d+)$", "$1")) AS ?comboQIDs)
12+
WHERE {{
13+
?lexeme dct:language wd:{LANGUAGE_QID};
14+
wikibase:lexicalCategory wd:{DATA_TYPE_QID};
15+
ontolex:lexicalForm ?form.
16+
?form wikibase:grammaticalFeature ?feature.
2317
}}
2418
GROUP BY ?form
19+
LIMIT {MAX_RESULTS}
2520
}}
21+
FILTER(?comboQIDs != "")
2622
}}
27-
GROUP BY ?features
28-
ORDER BY DESC(?frequency)
23+
GROUP BY ?comboQIDs
24+
HAVING(COUNT(?form) >= {MIN_FREQUENCY})
25+
ORDER BY DESC(?formsWithThisCombo)

src/scribe_data/resources/language_metadata.json

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
"qid": "Q9610"
1313
},
1414
"chinese": {
15+
"qid": "Q7850",
1516
"sub_languages": {
1617
"mandarin": {
1718
"iso": "zh",
@@ -68,14 +69,15 @@
6869
"qid": "Q9288"
6970
},
7071
"hindustani": {
72+
"qid": "Q11051",
7173
"sub_languages": {
7274
"hindi": {
7375
"iso": "hi",
74-
"qid": "Q11051"
76+
"qid": "Q1568"
7577
},
7678
"urdu": {
7779
"iso": "ur",
78-
"qid": "Q11051"
80+
"qid": "Q1617"
7981
}
8082
}
8183
},
@@ -120,6 +122,7 @@
120122
"qid": "Q36236"
121123
},
122124
"norwegian": {
125+
"qid": "Q9043",
123126
"sub_languages": {
124127
"bokmål": {
125128
"iso": "nb",
@@ -152,6 +155,7 @@
152155
"qid": "Q5146"
153156
},
154157
"punjabi": {
158+
"qid": "Q58635",
155159
"sub_languages": {
156160
"gurmukhi": {
157161
"iso": "pa",
@@ -168,6 +172,7 @@
168172
"qid": "Q7737"
169173
},
170174
"sami": {
175+
"qid": "Q33947",
171176
"sub_languages": {
172177
"northern": {
173178
"iso": "se",

src/scribe_data/wikidata/language_data_extraction/arabic/adjectives/query_adjectives_1.sparql

Lines changed: 12 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,9 @@ SELECT
66
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
77
?lastModified
88
?adjective
9-
?nominativeSingularConstruct
10-
?nominativePluralConstruct
11-
?genitiveSingularConstruct
12-
?genitivePluralConstruct
13-
?accusativePluralConstruct
14-
?nominativePausalMasculineIndefiniteSingular
9+
?pluralFeminine
10+
?pluralMasculine
11+
?singularMasculineIndefinitePausal
1512

1613
WHERE {
1714
?lexeme dct:language wd:Q13955 ;
@@ -20,38 +17,20 @@ WHERE {
2017
schema:dateModified ?lastModified .
2118

2219
OPTIONAL {
23-
?lexeme ontolex:lexicalForm ?nominativeSingularConstructForm .
24-
?nominativeSingularConstructForm ontolex:representation ?nominativeSingularConstruct ;
25-
wikibase:grammaticalFeature wd:Q131105, wd:Q110786, wd:Q1641446 .
20+
?lexeme ontolex:lexicalForm ?pluralFeminineForm .
21+
?pluralFeminineForm ontolex:representation ?pluralFeminine ;
22+
wikibase:grammaticalFeature wd:Q146786, wd:Q1775415 .
2623
}
2724

2825
OPTIONAL {
29-
?lexeme ontolex:lexicalForm ?nominativePluralConstructForm .
30-
?nominativePluralConstructForm ontolex:representation ?nominativePluralConstruct ;
31-
wikibase:grammaticalFeature wd:Q131105, wd:Q146786, wd:Q1641446 .
26+
?lexeme ontolex:lexicalForm ?pluralMasculineForm .
27+
?pluralMasculineForm ontolex:representation ?pluralMasculine ;
28+
wikibase:grammaticalFeature wd:Q146786, wd:Q499327 .
3229
}
3330

3431
OPTIONAL {
35-
?lexeme ontolex:lexicalForm ?genitiveSingularConstructForm .
36-
?genitiveSingularConstructForm ontolex:representation ?genitiveSingularConstruct ;
37-
wikibase:grammaticalFeature wd:Q146233, wd:Q110786, wd:Q1641446 .
38-
}
39-
40-
OPTIONAL {
41-
?lexeme ontolex:lexicalForm ?genitivePluralConstructForm .
42-
?genitivePluralConstructForm ontolex:representation ?genitivePluralConstruct ;
43-
wikibase:grammaticalFeature wd:Q146233, wd:Q146786, wd:Q1641446 .
44-
}
45-
46-
OPTIONAL {
47-
?lexeme ontolex:lexicalForm ?accusativePluralConstructForm .
48-
?accusativePluralConstructForm ontolex:representation ?accusativePluralConstruct ;
49-
wikibase:grammaticalFeature wd:Q146078, wd:Q146786, wd:Q1641446 .
50-
}
51-
52-
OPTIONAL {
53-
?lexeme ontolex:lexicalForm ?nominativePausalMasculineIndefiniteSingularForm .
54-
?nominativePausalMasculineIndefiniteSingularForm ontolex:representation ?nominativePausalMasculineIndefiniteSingular ;
55-
wikibase:grammaticalFeature wd:Q131105, wd:Q117262361, wd:Q499327, wd:Q53997857, wd:Q110786 .
32+
?lexeme ontolex:lexicalForm ?singularMasculineIndefinitePausalForm .
33+
?singularMasculineIndefinitePausalForm ontolex:representation ?singularMasculineIndefinitePausal ;
34+
wikibase:grammaticalFeature wd:Q110786, wd:Q499327, wd:Q53997857, wd:Q117262361 .
5635
}
5736
}

src/scribe_data/wikidata/language_data_extraction/arabic/adjectives/query_adjectives_10.sparql

Lines changed: 16 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,10 @@ SELECT
66
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
77
?lastModified
88
?adjective
9-
?nominativeMasculineSingular
10-
?nominativeMasculineDefiniteSingular
11-
?nominativeMasculineDefiniteDual
12-
?nominativeMasculineSingularConstruct
13-
?nominativeMasculinePluralConstruct
9+
?singularAccusativeFeminineIndefinite
10+
?singularAccusativeFeminineDefinite
11+
?singularAccusativeMasculineIndefinite
12+
?singularAccusativeMasculineDefinite
1413

1514
WHERE {
1615
?lexeme dct:language wd:Q13955 ;
@@ -19,32 +18,26 @@ WHERE {
1918
schema:dateModified ?lastModified .
2019

2120
OPTIONAL {
22-
?lexeme ontolex:lexicalForm ?nominativeMasculineSingularForm .
23-
?nominativeMasculineSingularForm ontolex:representation ?nominativeMasculineSingular ;
24-
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q110786 .
21+
?lexeme ontolex:lexicalForm ?singularAccusativeFeminineIndefiniteForm .
22+
?singularAccusativeFeminineIndefiniteForm ontolex:representation ?singularAccusativeFeminineIndefinite ;
23+
wikibase:grammaticalFeature wd:Q110786, wd:Q146078, wd:Q1775415, wd:Q53997857 .
2524
}
2625

2726
OPTIONAL {
28-
?lexeme ontolex:lexicalForm ?nominativeMasculineDefiniteSingularForm .
29-
?nominativeMasculineDefiniteSingularForm ontolex:representation ?nominativeMasculineDefiniteSingular ;
30-
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q53997851, wd:Q110786 .
27+
?lexeme ontolex:lexicalForm ?singularAccusativeFeminineDefiniteForm .
28+
?singularAccusativeFeminineDefiniteForm ontolex:representation ?singularAccusativeFeminineDefinite ;
29+
wikibase:grammaticalFeature wd:Q110786, wd:Q146078, wd:Q1775415, wd:Q53997851 .
3130
}
3231

3332
OPTIONAL {
34-
?lexeme ontolex:lexicalForm ?nominativeMasculineDefiniteDualForm .
35-
?nominativeMasculineDefiniteDualForm ontolex:representation ?nominativeMasculineDefiniteDual ;
36-
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q53997851, wd:Q110022 .
33+
?lexeme ontolex:lexicalForm ?singularAccusativeMasculineIndefiniteForm .
34+
?singularAccusativeMasculineIndefiniteForm ontolex:representation ?singularAccusativeMasculineIndefinite ;
35+
wikibase:grammaticalFeature wd:Q110786, wd:Q146078, wd:Q499327, wd:Q53997857 .
3736
}
3837

3938
OPTIONAL {
40-
?lexeme ontolex:lexicalForm ?nominativeMasculineSingularConstructForm .
41-
?nominativeMasculineSingularConstructForm ontolex:representation ?nominativeMasculineSingularConstruct ;
42-
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q110786, wd:Q1641446 .
43-
}
44-
45-
OPTIONAL {
46-
?lexeme ontolex:lexicalForm ?nominativeMasculinePluralConstructForm .
47-
?nominativeMasculinePluralConstructForm ontolex:representation ?nominativeMasculinePluralConstruct ;
48-
wikibase:grammaticalFeature wd:Q131105, wd:Q499327, wd:Q146786, wd:Q1641446 .
39+
?lexeme ontolex:lexicalForm ?singularAccusativeMasculineDefiniteForm .
40+
?singularAccusativeMasculineDefiniteForm ontolex:representation ?singularAccusativeMasculineDefinite ;
41+
wikibase:grammaticalFeature wd:Q110786, wd:Q146078, wd:Q499327, wd:Q53997851 .
4942
}
5043
}

src/scribe_data/wikidata/language_data_extraction/arabic/adjectives/query_adjectives_11.sparql

Lines changed: 0 additions & 50 deletions
This file was deleted.

src/scribe_data/wikidata/language_data_extraction/arabic/adjectives/query_adjectives_12.sparql

Lines changed: 0 additions & 50 deletions
This file was deleted.

0 commit comments

Comments
 (0)