blob: 345218e64277bf693a7d9a0d3419054c1bf3ca23 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
import training
import os
import sys
# Report progress once per this many processed names.
PROGRESS_INTERVAL = 128


def load_alphabet(path):
    """Return the set of entries listed one per line in the file at ``path``.

    The file is split on ``"\\n"``; every resulting piece (including an empty
    string produced by a trailing newline) becomes a member of the set. A set
    is used so the per-letter membership test during filtering is O(1).
    """
    with open(path, "r") as file:
        return set(file.read().split("\n"))


def filter_names(names, alphabet, preprocess):
    """Preprocess every name and keep those made only of alphabet entries.

    Args:
        names: sized iterable of raw names.
        alphabet: container of allowed letters (ideally a ``set``).
        preprocess: callable applied to each raw name before validation; the
            preprocessed form is what gets validated and returned.

    Returns:
        A list of the preprocessed names whose every letter is in
        ``alphabet``, in their original order. A name that is empty after
        preprocessing is kept, since it contains no invalid letter.
    """
    total = len(names)
    kept = []
    # Use a dedicated counter name: the original reused the country-code
    # variable for counting, which corrupted later status messages.
    for processed, raw_name in enumerate(names, start=1):
        name = preprocess(raw_name)
        # A single letter outside the alphabet invalidates the whole name.
        if all(letter in alphabet for letter in name):
            kept.append(name)
        if processed % PROGRESS_INTERVAL == 0:
            # "\r" rewrites the same console line; flush so it shows up
            # immediately even though no newline is emitted.
            print(f"\r{processed}/{total}", end="", flush=True)
    if total >= PROGRESS_INTERVAL:
        # Terminate the progress line so following output starts on its own line.
        print()
    return kept


if __name__ == "__main__":
    countries = training.get_countries()
    for code, country in countries.items():
        # Only the "uk" and "usa" countries are filtered.
        if code not in ("uk", "usa"):
            continue
        # The alphabet depends only on the country, so load it once instead
        # of once per dataset.
        alphabet = load_alphabet(os.path.join(country.path, "alphabet.txt"))
        # iterate through each dataset separately
        for dataset, path in country.datasets.items():
            # print information
            print(f"filtering through {code}'s {dataset}...")
            sys.stdout.flush()
            names = country.get_names(dataset)
            # keep only the names that are valid for this country's alphabet
            names_output = filter_names(names, alphabet, country.preprocess)
            # print how many names are left
            print(f"kept {len(names_output)}/{len(names)} names")
            # save dataset (overwrites the file at the dataset's path)
            with open(path, "w") as file:
                file.writelines(f"{name}\n" for name in names_output)
|