# Origin: commit 290c68795d8100cc97b8b53d80f331e536fc71b1 by davidovski,
# Wed, 30 Nov 2022 10:06:56 +0000, "Added files to repository"
# (new file src/neuralnetwork/generate_alphabets.py).
"""Generate an ``alphabet.txt`` file for every country dataset.

For each country returned by ``training.get_countries()`` the script
preprocesses every name in the dataset, collects the distinct letters that
occur, and writes them one per line, in ascending order, to
``alphabet.txt`` inside the country's directory.
"""
import os
import sys


def collect_letters(names, preprocess=None):
    """Return the distinct letters used in *names*, sorted ascending.

    Args:
        names: Iterable of names; each (preprocessed) name is iterated
            letter by letter.
        preprocess: Optional callable applied to every name before its
            letters are collected.

    Returns:
        A list of the distinct letters in ascending (code point) order.
        The order is alphabetical, not by frequency, so the result is
        deterministic for a given set of names.
    """
    letters = set()
    for name in names:
        if preprocess is not None:
            name = preprocess(name)
        letters.update(name)
    return sorted(letters)


def write_alphabet(path, letters):
    """Write *letters* to *path*, one letter per line.

    The file is always written as UTF-8: the datasets contain non-ASCII
    letters, and relying on the platform's default encoding can raise
    ``UnicodeEncodeError`` or produce files that differ between machines.
    """
    with open(path, "w", encoding="utf-8") as file:
        file.writelines(letter + "\n" for letter in letters)


def main():
    """Regenerate ``alphabet.txt`` for every available country dataset."""
    # Imported here rather than at module level so that the helpers above
    # can be imported (e.g. for testing) without the project's training
    # package being importable.
    import training

    # Mapping of country key -> country dataset object.
    countries = training.get_countries()

    for c, country in countries.items():
        print(f"processing {c}...", end="")
        sys.stdout.flush()

        # All the names in this country's dataset.
        all_names = country.get_all()
        letters = collect_letters(all_names, country.preprocess)

        print(f" found {len(letters)} in {len(all_names)} names... ", end="")
        sys.stdout.flush()

        write_alphabet(os.path.join(country.path, "alphabet.txt"), letters)

        print("saved!")
        sys.stdout.flush()


if __name__ == "__main__":
    main()