blob: 0b1e18ee98e80c109828e4e0bf626a0b83bcb54f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
|
import training
import os
import sys
if __name__ == "__main__":
    # For every country dataset, collect the distinct characters that occur in
    # its preprocessed names and write them, one per line, to "alphabet.txt"
    # inside that country's directory (country.path).
    countries = training.get_countries()
    for c, country in countries.items():
        print(f"processing {c}...", end="", flush=True)

        # Gather every distinct character seen across the preprocessed names.
        # Only the set of characters is needed; per-character counts were
        # never used by this script.
        all_names = country.get_all()
        letters = set()
        for name in all_names:
            letters.update(country.preprocess(name))
        print(
            f" found {len(letters)} in {len(all_names)} names... ",
            end="",
            flush=True,
        )

        # Sorted by character value (code point), not by frequency, so the
        # file contents are deterministic from run to run.
        letters_sorted = sorted(letters)

        # Write with an explicit encoding: names may contain non-ASCII
        # characters, and the platform default encoding may be unable to
        # represent them.
        # NOTE(review): whatever reads alphabet.txt must also open it as
        # UTF-8 -- confirm against the reader in `training`.
        alphabet_path = os.path.join(country.path, "alphabet.txt")
        with open(alphabet_path, "w", encoding="utf-8") as file:
            file.writelines(letter + "\n" for letter in letters_sorted)
        print("saved!", flush=True)
|