Pull request #14: #174 Clean the data
download_nltk_data.py
@@ -0,0 +1,5 @@
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
hindi_stopwords.txt
@@ -0,0 +1,264 @@
मैं
मुझको
मेरा
अपने आप को
हमने
हमारा
अपना
हम
आप
आपका
तुम्हारा
अपने आप
स्वयं
वह
इसे
उसके
खुद को
कि वह
उसकी
उसका
खुद ही
यह
इसके
उन्होने
अपने
क्या
जो
किसे
किसको
कि
ये
हूँ
होता है
रहे
थी
थे
होना
गया
किया जा रहा है
किया है
है
पडा
होने
करना
करता है
किया
रही
एक
लेकिन
अगर
या
क्यूंकि
जैसा
जब तक
जबकि
की
पर
द्वारा
के लिए
साथ
के बारे में
खिलाफ
बीच
में
के माध्यम से
दौरान
से पहले
के बाद
ऊपर
नीचे
को
से
तक
से नीचे
करने में
निकल
बंद
से अधिक
तहत
दुबारा
आगे
फिर
एक बार
यहाँ
वहाँ
कब
कहाँ
क्यों
कैसे
सारे
किसी
दोनो
प्रत्येक
ज्यादा
अधिकांश
अन्य
में कुछ
ऐसा
में कोई
मात्र
खुद
समान
इसलिए
बहुत
सकता
जायेंगे
जरा
चाहिए
अभी
और
कर दिया
रखें
का
हैं
इस
होता
करने
ने
बनी
तो
ही
हो
इसका
था
हुआ
वाले
बाद
लिए
सकते
इसमें
दो
वे
करते
कहा
वर्ग
कई
करें
होती
अपनी
उनके
यदि
हुई
जा
कहते
जब
होते
कोई
हुए
व
जैसे
सभी
करता
उनकी
तरह
उस
आदि
इसकी
उनका
इसी
पे
तथा
भी
परंतु
इन
कम
दूर
पूरे
गये
तुम
मै
यहां
हुये
कभी
अथवा
गयी
प्रति
जाता
इन्हें
गई
अब
जिसमें
लिया
बड़ा
जाती
तब
उसे
जाते
लेकर
बड़े
दूसरे
जाने
बाहर
स्थान
उन्हें
गए
ऐसे
जिससे
समय
दोनों
किए
रहती
इनके
इनका
इनकी
सकती
आज
कल
जिन्हें
जिन्हों
तिन्हें
तिन्हों
किन्हों
किन्हें
इत्यादि
इन्हों
उन्हों
बिलकुल
निहायत
इन्हीं
उन्हीं
जितना
दूसरा
कितना
साबुत
वग़ैरह
कौनसा
लिये
दिया
जिसे
तिसे
काफ़ी
पहले
बाला
मानो
अंदर
भीतर
पूरा
सारा
उनको
वहीं
जहाँ
जीधर
के
एवं
कुछ
कुल
रहा
जिस
जिन
तिस
तिन
कौन
किस
संग
यही
बही
उसी
मगर
कर
मे
एस
उन
सो
अत
(main data-cleaning script)
@@ -1,4 +1,18 @@
import pandas as pd
import json
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder
import subprocess

# Function to run another Python script
def run_script(script_name):
    subprocess.check_call(['python', script_name])

# Call the script to ensure NLTK data is available
run_script('download_nltk_data.py')

# Select environment (TEST/PROD)
ENVIRONMENT = "PROD"

@@ -13,3 +27,58 @@
# Load the storybooks
storybooks_pd = pd.read_csv(RAW_DATA_DIR + "/storybooks.csv")
print(f"storybooks_pd: \n{storybooks_pd}")

# Each chapter's details are stored in the 'chapters' column of the dataframe
# in JSON format, so extract only the paragraphs of the book.
def extract_chapters_text(chapters_json):
    try:
        chapters = json.loads(chapters_json)
        return ' '.join(paragraph['originalText'] for chapter in chapters for paragraph in chapter.get('storyBookParagraphs', []))
    except (TypeError, json.JSONDecodeError):
        return ''

# Now the 'combined_chapters_text' column contains all the chapter paragraphs
storybooks_pd['combined_chapters_text'] = storybooks_pd['chapters'].apply(extract_chapters_text)
print(f"storybooks_pd_new: \n{storybooks_pd}")
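For illustration, here is the chapter-extraction logic run on a hypothetical chapters JSON value (the field names `originalText` and `storyBookParagraphs` come from the PR; the sample text is made up):

```python
import json

# Same logic as extract_chapters_text above, repeated so the snippet
# is self-contained.
def extract_chapters_text(chapters_json):
    try:
        chapters = json.loads(chapters_json)
        return ' '.join(p['originalText']
                        for ch in chapters
                        for p in ch.get('storyBookParagraphs', []))
    except (TypeError, json.JSONDecodeError):
        return ''

# Hypothetical sample value for the 'chapters' column
sample = json.dumps([
    {"storyBookParagraphs": [{"originalText": "Once upon a time"},
                             {"originalText": "there was a fox."}]}
])
print(extract_chapters_text(sample))       # Once upon a time there was a fox.
print(repr(extract_chapters_text(None)))   # '' (missing values fall back to empty string)
```

The `except` clause matters because `pd.read_csv` yields `NaN` (a float, hence `TypeError`) for empty cells, and malformed JSON raises `JSONDecodeError`.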
# Removing stop words

[Review comment] @jo-elimu: @SnehaHS65 What are stop words, and why is it necessary to remove them from the storybooks?

[Reply] @SnehaHS65: @jo-elimu, these are the basic steps we follow in NLP to train the ML model. Stopwords are common words in a language that are often filtered out during NLP tasks because they carry little meaning or contribute minimally to the overall understanding of a text. Examples include "the," "is," "and," "in," etc. Since this dataset is in Hindi, I collected Hindi stop words, stored them in a file, and removed them from the required columns.
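A minimal English-language sketch of the filtering step described in the reply above (toy word list; the real pipeline uses the Hindi list from hindi_stopwords.txt):

```python
# Toy stopword set; the actual code loads Hindi stopwords from a file.
stopwords = {"the", "is", "and", "in"}
tokens = "the sky is blue and the grass is green".split()
filtered = [w for w in tokens if w not in stopwords]
print(filtered)  # ['sky', 'blue', 'grass', 'green']
```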
def load_stopwords(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        stopwords = set(file.read().splitlines())
    return stopwords

RAW_HINDI_STOPWORDS = "./env-" + ENVIRONMENT + "/lang-" + LANGUAGE

# Define stopwords file path
stopwords_file_path = RAW_HINDI_STOPWORDS + "/hindi_stopwords.txt"
hindi_stopwords = load_stopwords(stopwords_file_path)
# Preprocess the data
def preprocess_text(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in hindi_stopwords]
    # Join tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to each relevant column
storybooks_pd['preprocessed_title'] = storybooks_pd['title'].apply(preprocess_text)
storybooks_pd['preprocessed_description'] = storybooks_pd['description'].apply(preprocess_text)
storybooks_pd['preprocessed_combined_chapters_text'] = storybooks_pd['combined_chapters_text'].apply(preprocess_text)

# Vectorization using TF-IDF
# Note: fit_transform refits the vectorizer on each call, so every matrix
# below uses its own vocabulary; the object's final fitted state matches
# only the last column.
tfidf_vectorizer = TfidfVectorizer()
title_vectors = tfidf_vectorizer.fit_transform(storybooks_pd['preprocessed_title'])
description_vectors = tfidf_vectorizer.fit_transform(storybooks_pd['preprocessed_description'])
chapters_vectors = tfidf_vectorizer.fit_transform(storybooks_pd['preprocessed_combined_chapters_text'])

# Combine features (used for training the ML model)
combined_features = hstack([title_vectors, description_vectors, chapters_vectors])

# 'reading_level' is the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(storybooks_pd['reading_level'])
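A toy run of the TF-IDF / `hstack` / `LabelEncoder` pipeline above on made-up data (all strings and labels here are hypothetical):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack

titles = ["red fox", "blue sky"]           # stand-ins for preprocessed titles
descriptions = ["fox story", "sky story"]  # stand-ins for preprocessed descriptions

# One vectorizer per column, so each keeps its own vocabulary
title_vectors = TfidfVectorizer().fit_transform(titles)
description_vectors = TfidfVectorizer().fit_transform(descriptions)

# Horizontally stack the sparse matrices into one feature matrix
combined = hstack([title_vectors, description_vectors])
print(combined.shape)  # (2, 7): 4 title terms + 3 description terms

levels = ["LEVEL1", "LEVEL2", "LEVEL1"]    # hypothetical reading levels
y = LabelEncoder().fit_transform(levels)
print(list(y))  # [0, 1, 0]
```

Because `fit_transform` refits on each call, reusing one `TfidfVectorizer` (as the PR does) produces the same matrices as separate vectorizers; the difference only shows up if the fitted object is used again later.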
requirements.txt
@@ -1 +1,7 @@
pandas==2.2.2
numpy==2.0.1
nltk==3.8.2

[Review comment] @SnehaHS65 Looks like there is a problem with the …

pytz==2024.1
regex==2024.7.24
scikit-learn==1.5.1
scipy==1.14.0
[Review comment] @SnehaHS65 The `subprocess` code is not working for me. Can we use `import download_nltk_data` instead?
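One way the reviewer's import-based suggestion could look (a sketch, assuming download_nltk_data.py is on the import path; the helper name `ensure_nltk_data` is hypothetical):

```python
import importlib

def ensure_nltk_data(module_name='download_nltk_data'):
    """Run the download script by importing it instead of spawning a
    subprocess. Importing executes the module's top-level
    nltk.download(...) calls; repeated imports are cached, so the
    downloads are triggered at most once per interpreter session."""
    return importlib.import_module(module_name)
```

Unlike `subprocess.check_call(['python', script_name])`, this does not depend on a `python` executable being on PATH and reuses the current interpreter and environment.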