diff options
author | davidovski <david@sendula.com> | 2022-11-30 10:06:56 +0000 |
---|---|---|
committer | davidovski <david@sendula.com> | 2022-11-30 10:06:56 +0000 |
commit | 290c68795d8100cc97b8b53d80f331e536fc71b1 (patch) | |
tree | bf0068c4c9121406df9bc90f5c159fd93de8a61e /src/neuralnetwork/generate_alphabets.py |
Diffstat (limited to 'src/neuralnetwork/generate_alphabets.py')
-rw-r--r-- | src/neuralnetwork/generate_alphabets.py | 44 |
1 files changed, 44 insertions, 0 deletions
# Reconstructed from the diff that adds src/neuralnetwork/generate_alphabets.py
# (new file, 44 lines in the original commit).
"""Generate an ``alphabet.txt`` file for every country dataset.

For each country returned by ``training.get_countries()`` the script reads
all names, preprocesses them with the country's own ``preprocess`` method,
collects every distinct letter that appears, and writes those letters (one
per line, in sorted order) to ``alphabet.txt`` inside the country's
directory.
"""
import os
import sys
from collections import Counter

import training


def _count_letters(names, preprocess):
    """Return a Counter mapping each letter to its number of occurrences.

    ``names`` is an iterable of raw names; ``preprocess`` is called on each
    name and its result is iterated letter by letter.
    """
    counts = Counter()
    for name in names:
        for letter in preprocess(name):
            counts[letter] += 1
    return counts


def main():
    """Write a sorted alphabet file for every available country dataset."""
    # List all the directories containing country datasets to populate the
    # countries dictionary.
    countries = training.get_countries()

    for code, country in countries.items():
        # flush=True so progress shows up immediately even though the line
        # is not terminated yet.
        print(f"processing {code}...", end="", flush=True)

        # Get all the names in this country's dataset.
        all_names = country.get_all()

        # Count every letter across all of the preprocessed names.
        letters = _count_letters(all_names, country.preprocess)

        print(
            f" found {len(letters)} in {len(all_names)} names... ",
            end="",
            flush=True,
        )

        # Sort the letters lexicographically (by code point). The occurrence
        # counts are deliberately NOT used for ordering: the original code
        # sorted the keys this way, and the resulting line order is kept
        # unchanged in case other code relies on it.
        letters_sorted = sorted(letters)

        # Output the sorted letters to a file, one letter per line.
        # NOTE(review): the file is opened with the platform default
        # encoding, exactly as before; confirm how alphabet.txt is read
        # before pinning an explicit encoding here.
        with open(os.path.join(country.path, "alphabet.txt"), "w") as file:
            file.writelines(f"{letter}\n" for letter in letters_sorted)

        print("saved!")
        sys.stdout.flush()


if __name__ == "__main__":
    main()