Skip to content

Commit 516c417

Browse files
Copilot and josix committed
Implement terminology extraction tools and generate translation dictionaries
Co-authored-by: josix <18432820+josix@users.noreply.github.com>
1 parent c63f0d6 commit 516c417

File tree

6 files changed

+18528
-1
lines changed

6 files changed

+18528
-1
lines changed

.scripts/README.md

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,34 @@
22

33
Useful scripts for the translation.
44

5+
## Translation Dictionary Generation
6+
7+
Extract terminology from all .po files and build a translation dictionary so that each term is translated consistently across files.
8+
9+
### extract_terminology.py
10+
Main script that processes all .po files and extracts terminology:
11+
12+
```sh
13+
python3 .scripts/extract_terminology.py
14+
```
15+
16+
Generates `terminology_dictionary.csv` with all extracted terms and their translations.
17+
18+
### create_focused_dictionary.py
19+
Creates a curated dictionary focusing on the most important Python terminology:
20+
21+
```sh
22+
python3 .scripts/create_focused_dictionary.py
23+
```
24+
25+
Generates `focused_terminology_dictionary.csv` with categorized high-priority terms.
26+
27+
See the terminology documentation for detailed usage and integration with the translation workflow.
28+
529
## From Google Translation
630

731
Translate all untranslated entries of the given .po file with Google Translate.
832

9-
1033
```sh
1134
.scripts/google_translate.sh library/csv.po
1235
```

.scripts/create_focused_dictionary.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Create a focused terminology dictionary for the most important Python terms.
4+
5+
This script extracts the most critical Python terminology for translation consistency.
6+
"""
7+
8+
import csv
9+
from collections import defaultdict, Counter
10+
11+
12+
def _is_important_term(source_term, frequency, files_count):
    """Return True when a term qualifies for the focused dictionary.

    Args:
        source_term: The stripped source-language term.
        frequency: Occurrence count parsed from the ``frequency`` column.
        files_count: File count parsed from the ``files_count`` column.
    """
    lowered = source_term.lower()

    # High priority: Python built-in types and keywords (exact match).
    if lowered in {
        'class', 'function', 'method', 'module', 'package', 'object', 'type',
        'int', 'str', 'list', 'dict', 'tuple', 'set', 'float', 'bool', 'complex',
        'none', 'true', 'false', 'return', 'import', 'def', 'async', 'await',
        'lambda', 'yield', 'raise', 'try', 'except', 'finally', 'with', 'as',
    }:
        return True

    # High priority: common Python concepts (case-insensitive substring match).
    if any(concept in lowered for concept in (
        'exception', 'error', 'iterator', 'generator', 'decorator', 'property',
        'classmethod', 'staticmethod', 'metaclass', 'inheritance', 'polymorphism',
    )):
        return True

    # High priority: terms that appear in many files (widespread usage).
    if files_count >= 20 and frequency >= 10:
        return True

    # Medium priority: code elements in backticks, or dunder names.
    # The parentheses only spell out the precedence `and` already has over `or`.
    if '`' in source_term or (
        source_term.startswith('__') and source_term.endswith('__')
    ):
        return True

    # Medium priority: terms with technical patterns (case-sensitive).
    return any(
        pattern in source_term
        for pattern in ('()', 'Error', 'Exception', 'Class')
    )


def _categorize_term(source_term, files_count):
    """Return the ``(category, priority)`` pair recorded for a term.

    The checks run in order and the first match wins; a term matching none
    of them is reported as ``('Other', 'Medium')``.
    """
    lowered = source_term.lower()

    if lowered in {'class', 'function', 'method', 'module', 'package', 'object', 'type'}:
        return 'Core Concepts', 'High'
    if lowered in {'int', 'str', 'list', 'dict', 'tuple', 'set', 'float', 'bool', 'complex'}:
        return 'Built-in Types', 'High'
    if lowered in {'none', 'true', 'false', 'return', 'import', 'def', 'async', 'await'}:
        return 'Keywords/Constants', 'High'
    if 'error' in lowered or 'exception' in lowered:
        return 'Exceptions', 'High'
    if '`' in source_term:
        return 'Code Elements', 'Medium'
    if files_count >= 50:
        return 'Common Terms', 'High'
    return 'Other', 'Medium'


def create_focused_dictionary():
    """Create a focused dictionary with the most important terms.

    Reads ``terminology_dictionary.csv`` from the current working directory
    (columns used: ``source_term``, ``translated_term``, ``frequency``,
    ``files_count``, ``example_files``), keeps the rows selected by
    ``_is_important_term``, sorts them by descending frequency, and writes
    them with added ``priority`` and ``category`` columns to
    ``focused_terminology_dictionary.csv``. A summary and a per-category
    count are printed to stdout.

    Raises:
        FileNotFoundError: If ``terminology_dictionary.csv`` does not exist.
        ValueError: If a ``frequency`` or ``files_count`` cell is not an
            integer.
    """
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are read back unchanged.
    with open("terminology_dictionary.csv", 'r', newline='', encoding='utf-8') as csvfile:
        important_terms = [
            row for row in csv.DictReader(csvfile)
            if _is_important_term(
                row['source_term'].strip(),
                int(row['frequency']),
                int(row['files_count']),
            )
        ]

    # Sort by frequency (most common first); list.sort is stable, so rows
    # with equal frequency keep their input order.
    important_terms.sort(key=lambda row: int(row['frequency']), reverse=True)

    # Tally categories while writing so the printed breakdown always matches
    # the category column in the output file.
    categories = Counter()

    with open("focused_terminology_dictionary.csv", 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['source_term', 'translated_term', 'frequency', 'files_count',
                      'priority', 'category', 'example_files']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for term_data in important_terms:
            source_term = term_data['source_term'].strip()
            category, priority = _categorize_term(
                source_term, int(term_data['files_count'])
            )
            categories[category] += 1

            writer.writerow({
                'source_term': source_term,
                'translated_term': term_data['translated_term'],
                'frequency': term_data['frequency'],
                'files_count': term_data['files_count'],
                'priority': priority,
                'category': category,
                'example_files': term_data['example_files'],
            })

    print(f"Created focused terminology dictionary with {len(important_terms)} important terms")

    # Print category statistics
    print("\nCategory breakdown:")
    for category, count in categories.items():
        print(f" {category}: {count} terms")
137+
138+
139+
# Build the focused dictionary only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    create_focused_dictionary()

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy