Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
GokulNC committed Aug 5, 2021
0 parents commit cd2a1c0
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.pyc
.vs/
tmp/
*.tsv
9 changes: 9 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[submodule "raw/eng2tamildictionary"]
path = raw/eng2tamildictionary
url = https://github.com/linuxkathirvel/eng2tamildictionary
[submodule "raw/english-tamil-dictionary-api"]
path = raw/english-tamil-dictionary-api
url = https://github.com/abuvanth/english-tamil-dictionary-api
[submodule "raw/dictionary"]
path = raw/dictionary
url = https://github.com/sathia27/dictionary
62 changes: 62 additions & 0 deletions consolidate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from tqdm import tqdm
import re
import pandas as pd

en_lang_pairs_count = {}

def add_pair(en_word, lang_word):
if type(en_word) is not str or type(lang_word) is not str:
return
en_word = en_word.replace('"', '').strip().lower().rstrip('.')
lang_word = lang_word.replace('"', '').strip().rstrip('.')
key = (en_word, lang_word)
if len(key[0])<2 or len(key[1])<2:
return
if key in en_lang_pairs_count:
en_lang_pairs_count[key] += 1
else:
en_lang_pairs_count[key] = 1
return

###################################################################

# https://github.com/linuxkathirvel/eng2tamildictionary

import json
data = json.load(open('raw/eng2tamildictionary/dictionary.json', encoding="utf-8"))

for record in tqdm(data, desc='eng2tamildictionary'):
try:
en_word, lang_words = record["eng"], record["tamil"]
except:
if 'word_list' in record:
continue
raise

en_word = re.sub('\([^)]*?\)', '', en_word) # Remove brackets
lang_words = re.sub('\([^)]*?\)', '', lang_words) # Remove brackets
lang_words = re.sub("[A-Za-z]\.?", ' ', lang_words) # Remove English words from tamil
lang_words = re.sub("-[1-9]", ' ', lang_words) # Remove numbered bullets
for lang_word in lang_words.split(','):
add_pair(en_word, lang_word)

# https://github.com/abuvanth/english-tamil-dictionary-api
# Same as above, so ignore.

# https://github.com/sathia27/dictionary
# Bad format. TODO: Clean and parse

# import sqlite3, os
# con = sqlite3.connect('raw/dictionary/word.db', isolation_level=None, detect_types=sqlite3.PARSE_COLNAMES)

# df = pd.read_sql_query("SELECT * FROM words", con)
# os.makedirs('tmp', exist_ok=True)
# df.to_csv("tmp/dict.csv")

## WRITE

out = open('consolidated.tsv', 'w', encoding='utf-8')
out.write(f"ENG\tLANG\tCOUNT\n")
for (en_word, lang_word), count in en_lang_pairs_count.items():
out.write(f"{en_word}\t{lang_word}\t{count}\n")
out.close()
1 change: 1 addition & 0 deletions raw/dictionary
Submodule dictionary added at 6f5049
1 change: 1 addition & 0 deletions raw/eng2tamildictionary
Submodule eng2tamildictionary added at a4e069
1 change: 1 addition & 0 deletions raw/english-tamil-dictionary-api

0 comments on commit cd2a1c0

Please sign in to comment.