diff --git a/setup.py b/setup.py index a42f59f..b9a1064 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ def read(*names, **kwargs): "nbqa", "flake8", "jupytext", + "codespell", # ensure we have a valid IPython version since # black needs it "ipython<=8.12.0; python_version <= '3.8'", diff --git a/src/pkgmt/cli.py b/src/pkgmt/cli.py index b5fcf24..94a9ff3 100644 --- a/src/pkgmt/cli.py +++ b/src/pkgmt/cli.py @@ -203,6 +203,3 @@ def lint(files, exclude): if returncode: raise SystemExit("Error linting") - -if __name__ == '__main__': - cli() \ No newline at end of file diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000..b2cb7d5 --- /dev/null +++ b/test.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{"id":"QUdOQKBjalTs"},"source":["**# Install the library Urduhack: A Python NLP library for Urdu language**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7hDKYcogbFzg"},"outputs":[],"source":["!pip install Urduhack"]},{"cell_type":"markdown","metadata":{"id":"hsILf-3dbdIV"},"source":["**# Import the necessary libraries**"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3055,"status":"ok","timestamp":1702895856151,"user":{"displayName":"Khubaib Ahmad","userId":"13992953376961780846"},"user_tz":-300},"id":"t3JWEU9kaJhS","outputId":"9e92627f-cfc4-4fc1-d4b5-e06b692e9b24"},"outputs":[{"name":"stderr","output_type":"stream","text":["/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning: \n","\n","TensorFlow Addons (TFA) has ended development and introduction of new features.\n","TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.\n","Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 
\n","\n","For more information see: https://github.com/tensorflow/addons/issues/2807 \n","\n"," warnings.warn(\n","[nltk_data] Downloading package punkt to\n","[nltk_data] /Users/danial.shabbir/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n"]}],"source":["from tqdm import tqdm\n","import os\n","import re\n","import calendar\n","import numpy as np\n","import pandas as pd\n","import pickle\n","from collections import Counter\n","import urduhack\n","import nltk\n","from nltk.tokenize import word_tokenize\n","nltk.download('punkt')\n","import json\n","import itertools\n","from typing import List\n","from sklearn.metrics import confusion_matrix, precision_score, recall_score\n","# Download UrduHack resources\n","urduhack.download()\n","from urduhack.normalization import normalize\n","from urduhack.preprocessing import normalize_whitespace, remove_punctuation, remove_accents, replace_urls, replace_emails, replace_numbers, replace_currency_symbols, remove_english_alphabets"]},{"cell_type":"markdown","metadata":{"id":"lf6xJ8mOaRRz"},"source":["**# Load the dataset from your google drive directory**"]},{"cell_type":"code","execution_count":78,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":67280,"status":"ok","timestamp":1702891465908,"user":{"displayName":"Khubaib Ahmad","userId":"13992953376961780846"},"user_tz":-300},"id":"UrJmeCOrIyd2","outputId":"1fd7d5ac-43db-4649-ce33-aa89d9789d56"},"outputs":[],"source":["# from google.colab import drive\n","# drive.mount('/content/drive')\n","# os.chdir('/content/drive/MyDrive/Colab Notebooks/Khubaib')\n","# import pandas as pd\n","\n","\n","data = pd.read_csv('data/urdu-news-dataset-1M.csv', index_col='Index', encoding='unicode_escape')"]},{"cell_type":"markdown","metadata":{"id":"GPVSV7XkK1vW"},"source":["**# Data preprocessing**"]},{"cell_type":"code","execution_count":79,"metadata":{"id":"TIeey5R-PH8C"},"outputs":[],"source":["# Drop null values\n","data = 
data.dropna()\n","data.reset_index(drop=True, inplace=True)"]},{"cell_type":"code","execution_count":80,"metadata":{"id":"Xm0w7zAbKwP2"},"outputs":[],"source":["# Function to convert encodings\n","def encodings_change(series):\n"," try:\n"," return [text.encode('latin1').decode('utf-8') for text in series]\n"," except:\n"," return series"]},{"cell_type":"code","execution_count":81,"metadata":{"id":"RMC_roiALETh"},"outputs":[],"source":["# Function to update date format\n","english_month_name = [i.lower() for i in list(calendar.month_name)]\n","urdu_month_map = ['',\n"," 'جنوری',\n"," 'فروری',\n"," 'مارچ',\n"," 'اپريل',\n"," 'مئی',\n"," 'جون',\n"," 'جولائی',\n"," 'اگست',\n"," 'ستمبر',\n"," 'اکتوبر',\n"," 'نومبر',\n"," 'دسمبر',\n"," '','','',\n"," 'اپریل',\n"," 'مئ','',\n"," 'جولائ',\n"," '','','','',''\n"," ]\n","\n","short_month_name = ['','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug',\n"," 'Sep','Oct','Nov','Dec']\n","\n","def date_encoding(date,urdu_month_map = urdu_month_map,\n"," english_month_name = english_month_name,\n"," short_month_name = short_month_name):\n","\n"," month = None\n"," if bool(re.match('\\d\\d\\d\\d\\-\\d\\d\\-\\d\\d',str(date))):\n"," return date\n"," if '/'in date:\n"," date = date.split('/')\n"," year = date[-1]\n"," month = date[0]\n"," day = date[1]\n"," date = f'{year}-{month}-{day}'\n"," return date\n","\n"," for i in date.encode('latin1').decode('utf-8').split():\n"," if ',' in i:\n"," i = i.replace(',','')\n"," if i in urdu_month_map:\n"," month = urdu_month_map.index(i)\n"," if month > 12:\n"," month = month - 12\n","\n"," elif i.lower() in english_month_name:\n"," month = english_month_name.index(i.lower())\n","\n"," elif i.lower() in short_month_name:\n"," month = short_month_name.index(i.lower())\n","\n"," if len(i)==4:\n"," year = i\n","\n"," if len(i)<=2:\n"," day = i\n","\n"," date = f'{year}-{month}-{day}'\n"," return 
date"]},{"cell_type":"code","execution_count":82,"metadata":{"id":"5baU_vQoLghD"},"outputs":[],"source":["def date_update(series):\n"," all_dates = []\n","\n"," for i,date in enumerate(series):\n"," if 'hours' in str(date):\n"," all_dates.append(np.nan)\n"," continue\n"," all_dates.append(date_encoding(date))\n"," return all_dates"]},{"cell_type":"code","execution_count":83,"metadata":{"id":"GHZcX0YALmpD"},"outputs":[],"source":["# Apply encodings_change and date_encoding to relevant columns\n","data['News Text'] = encodings_change(data['News Text'])\n","data['Headline'] = encodings_change(data['Headline'])\n","data['Date'] = date_update(data['Date'])"]},{"cell_type":"code","execution_count":84,"metadata":{},"outputs":[],"source":["# data = data.query('Headline.str.contains(\"خیبرپختونخوا\")')\\\n","# # .groupby('Category',as_index = False,group_keys=False).apply(lambda s: s.sample(2))\n","# #data.head()\n","# # data.head()\n"]},{"cell_type":"code","execution_count":85,"metadata":{"id":"ChzPzRiCR2AI"},"outputs":[],"source":["#save clean dataset\n","data_file = open('data/Urdu_News', 'wb')\n","pickle.dump(data, data_file)\n","data_file.close()"]},{"cell_type":"code","execution_count":86,"metadata":{"id":"IGHRNmsrV0NY"},"outputs":[],"source":[" # remove stop words\n","stop_words = frozenset(\"\"\"\n","آ آئی آئیں آئے آتا آتی آتے آداب آدھ آدھا آدھی آدھے آس\n"," آمدید آنا آنسہ آنی آنے آپ آگے آہ آہا آیا اب ابھی ابے\n"," اتوار ارب اربویں ارے اس اسکا اسکی اسکے اسی اسے اف افوہ الاول البتہ\n"," الثانی الحرام السلام الف المکرم ان اندر انکا انکی انکے انہوں انہی انہیں\n"," اوئے اور اوپر اوہو اپ اپنا اپنوں اپنی اپنے اپنےآپ اکبر اکثر اگر اگرچہ\n"," اگست اہاہا ایسا ایسی ایسے ایک بائیں بار بارے بالکل باوجود باہر بج بجے\n"," بخیر برسات بشرطیکہ بعض بغیر بلکہ بن بنا بناؤ بند بڑی بھر بھریں\n"," بھی بہار بہت بہتر بیگم تاکہ تاہم تب تجھ تجھی تجھے ترا تری\n"," تلک تم تمام تمہارا تمہاروں تمہاری تمہارے تمہیں تو تک تھا تھی تھیں تھے\n"," تہائی تیرا تیری تیرے تین جا جاؤ جائیں جائے جاتا جاتی 
جاتے جانی جانے\n"," جب جبکہ جدھر جس جسے جن جناب جنہوں جنہیں جو جہاں جی جیسا\n"," جیسوں جیسی جیسے جیٹھ حالانکہ حالاں حصہ حضرت خاطر خالی خدا خزاں خواہ خوب\n"," خود دائیں درمیان دریں دو دوران دوسرا دوسروں دوسری دوشنبہ دوں دکھائیں دگنا دی\n"," دیئے دیا دیتا دیتی دیتے دیر دینا دینی دینے دیکھو دیں دیے دے ذریعے\n"," رکھا رکھتا رکھتی رکھتے رکھنا رکھنی رکھنے رکھو رکھی رکھے رہ رہا رہتا\n"," رہتی رہتے رہنا رہنی رہنے رہو رہی رہیں رہے ساتھ سامنے ساڑھے سب سبھی\n"," سراسر سلام سمیت سوا سوائے سکا سکتا سکتے سہ سہی سی سے شام شاید\n"," شکریہ صاحب صاحبہ صرف ضرور طرح طرف طور علاوہ عین فروری فقط فلاں\n"," فی قبل قطا لائی لائے لاتا لاتی لاتے لانا لانی لایا لو لوجی لوگوں\n"," لگ لگا لگتا لگتی لگی لگیں لگے لہذا لی لیا لیتا لیتی لیتے لیکن\n"," لیں لیے لے ماسوا مت مجھ مجھی مجھے محترم محترمی محض مرا مرحبا\n"," مری مرے مزید مس مسز مسٹر مطابق مطلق مل منٹ منٹوں مکرمی مگر\n"," مگھر مہربانی میرا میروں میری میرے میں نا نزدیک نما نو نومبر نہ نہیں\n"," نیز نیچے نے و وار واسطے واقعی والا والوں والی والے واہ وجہ ورنہ\n"," وعلیکم وغیرہ ولے وگرنہ وہ وہاں وہی وہیں ویسا ویسے ویں پاس\n"," پایا پر پس پلیز پون پونا پونی پونے پھاگن پھر پہ پہر پہلا پہلی\n"," پہلے پیر پیچھے چاہئے چاہتے چاہیئے چاہے چلا چلو چلیں چلے چناچہ چند چونکہ\n"," چوگنی چکی چکیں چکے چہارشنبہ چیت ڈالنی ڈالنے ڈالے کئے کا کاتک کاش کب\n"," کبھی کدھر کر کرتا کرتی کرتے کرم کرنا کرنے کرو کریں کرے کس\n"," کسی کسے کل کم کن کنہیں کو کوئی کون کونسا کونسے کچھ کہ کہا\n"," کہاں کہہ کہی کہیں کہے کی کیا کیسا کیسے کیونکر کیونکہ کیوں کیے کے\n"," گئی گئے گا گرما گرمی گنا گو گویا گھنٹا گھنٹوں گھنٹے گی گیا\n"," ہائیں ہائے ہاڑ ہاں ہر ہرچند ہرگز ہزار ہفتہ ہم ہمارا ہماری ہمارے ہمی\n"," ہمیں ہو ہوئی ہوئیں ہوئے ہوا ہوبہو ہوتا ہوتی ہوتیں ہوتے ہونا ہونگے ہونی\n"," وہاں یہاں کہاں ہم ہر ہوۓ ہوئ ہوا ہوگا ہوگی ہوں گے میں کو تھا تھی تھے ہے سے اور اس ان اسے انہوں انہیں تک تم تو کا کی کے نا نے گا گی گے آئ آۓ گۓ گئ گیا جاتی جاتے جاتا چکا چکی چکے دیا دیۓ دیتا دیتے رک رکی رکا رکے سکی سکا سکے ہونے ہوں ہی ہیلو ہیں ہے یا یات یعنی یک یہ یہاں یہی 
یہیں\n","\"\"\".split())"]},{"cell_type":"code","execution_count":87,"metadata":{"id":"oodPMLHzWUcQ"},"outputs":[],"source":["def remove_stopwords(text: str):\n"," return \" \".join(word for word in text.split() if word not in stop_words)\n","def preprocess_stopwords(text):\n"," text = remove_stopwords(text)\n"," return text\n","def apply_stopwords_preprocess(series):\n"," preprocessed_stopword = []\n"," for text in series:\n"," preprocessed_stopword.append(preprocess_stopwords(text))\n"," return preprocessed_stopword"]},{"cell_type":"code","execution_count":88,"metadata":{"id":"TMu1_zWAWopr"},"outputs":[],"source":["data['Headline'] = apply_stopwords_preprocess(data['Headline'])\n","data['Headline'] = data['Headline'].apply(normalize)\n","data['Headline'] = data['Headline'].apply(remove_accents)\n","data['Headline'] = data['Headline'].apply(replace_urls)\n","data['Headline'] = data['Headline'].apply(replace_emails)\n","data['Headline'] = data['Headline'].apply(replace_currency_symbols)\n","data['Headline'] = data['Headline'].apply(normalize_whitespace)\n","data['Headline'] = data['Headline'].apply(remove_punctuation)\n","data['Headline'] = data['Headline'].apply(replace_numbers)\n","data['Headline'] = data['Headline'].apply(remove_english_alphabets)"]},{"cell_type":"code","execution_count":89,"metadata":{"id":"Uz0hUOjoXyGw"},"outputs":[],"source":["def removing_unwanted_data(text):\n","\n"," # Format words and remove unwanted characters from news headlines\n"," text = re.sub(r'https?:\\/\\/.*[\\r\\n]*', '', text, flags=re.MULTILINE)\n"," text = re.sub(r'\\', ' ', text)\n"," text = re.sub(r'\\'', ' ', text)\n"," return text\n","data['Headline']= list(map(removing_unwanted_data,data.Headline))"]},{"cell_type":"markdown","metadata":{"id":"DodGGrB6dqoO"},"source":["**# Divide dataset into headlines that are related to user query and those headlines that are not related to user 
query**"]},{"cell_type":"code","execution_count":90,"metadata":{"id":"xnYPgI07Yk9t"},"outputs":[],"source":["#consider the entertainment as negative topic and the other as positive docs.\n","positive_headlines = []\n","business_positive_headlines = data['Headline'][(data['Category'] == 'Business & Economics')][0:19309]\n","science_positive_headlines = data['Headline'][(data['Category'] == 'Science & Technology')][0:6400]\n","sports_positive_headlines = data['Headline'][(data['Category'] == 'Sports')][0:35870]\n","negative_headlines = data['Headline'][data['Category'] == 'Entertainment'][0:27930]\n","#concatenate them into 80% training data\n","positive_headlines = pd.concat([business_positive_headlines, science_positive_headlines, sports_positive_headlines], axis=0)"]},{"cell_type":"markdown","metadata":{"id":"YKekSrF3dMQz"},"source":["**# Features extraction**"]},{"cell_type":"code","execution_count":91,"metadata":{"id":"hM1TPi-mc6NN"},"outputs":[],"source":["positive_unique_words = set()\n","positive_headlines.str.lower().str.split().apply(positive_unique_words.update)\n","positive_unique_words = list(positive_unique_words)"]},{"cell_type":"markdown","metadata":{"id":"8L-5ZtaUlMLU"},"source":["**# Apply pattern taxonomy model (PTM)**"]},{"cell_type":"code","execution_count":92,"metadata":{"id":"aAZ4hLHlfotM"},"outputs":[],"source":["#assign each head a number.\n","positive_titles_refs = [f'dp{i+1}' for i in positive_headlines.index]"]},{"cell_type":"code","execution_count":93,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":441326,"status":"ok","timestamp":1702896320269,"user":{"displayName":"Khubaib Ahmad","userId":"13992953376961780846"},"user_tz":-300},"id":"QWmZUqQBf33c","outputId":"f787e68a-bda6-46bb-a97f-663149f598f7"},"outputs":[{"name":"stderr","output_type":"stream","text":["100%|██████████| 61579/61579 [01:35<00:00, 645.96it/s] \n"]}],"source":["#assign each ref to its words.\n","#example:- dp1: ['he', 'is', 
'mad']\n","set_of_paragraphs = {}\n","for i in tqdm(range(len(positive_headlines))):\n"," #tokenize headline\n"," title = word_tokenize(positive_headlines.iloc[i])\n"," terms = []\n"," for j in range(len(title)):\n"," if title[j] in positive_unique_words:\n"," terms.append(positive_unique_words[positive_unique_words.index(title[j])])\n"," set_of_paragraphs[positive_titles_refs[i]] = terms"]},{"cell_type":"code","execution_count":94,"metadata":{"id":"SZt2yUnkh-B6"},"outputs":[],"source":["#get tokenized terms\n","positive_terms = list(set_of_paragraphs.values())\n","#get dps\n","positive_dp = list(set_of_paragraphs.keys())"]},{"cell_type":"code","execution_count":95,"metadata":{"id":"iND12vDYiEwC"},"outputs":[],"source":["#save positive paragraphs refs to use them later in pattern deploying step\n","model_file = open('Model/Urdu Positive dp', 'wb')\n","pickle.dump(positive_dp, model_file)\n","model_file.close()\n","\n","#save positive terms to use them later in pattern deploying step\n","model_file = open('Model/Urdu Positive terms', 'wb')\n","pickle.dump(positive_terms, model_file)\n","model_file.close()"]},{"cell_type":"markdown","metadata":{"id":"v7VjULuPjxCR"},"source":["**# Create patterns from positive terms**"]},{"cell_type":"code","execution_count":96,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UbmI43nWiZMb","outputId":"19330bc2-6f0b-491a-ac04-fe3fdce0dab2"},"outputs":[{"name":"stderr","output_type":"stream","text":["100%|██████████| 61579/61579 [06:03<00:00, 169.36it/s] \n"]}],"source":["def create_combinations_of_terms(terms):\n"," #combinations list.\n"," all_combinations = []\n"," for i in tqdm(range(len(terms))):\n"," combinations = []\n"," for j in range(1, len(terms[i])+1):\n"," combinations.append(list(itertools.combinations(terms[i], j)))\n"," #flatten it since it's 2d list.\n"," combinations = [item for sublist in combinations for item in sublist]\n"," all_combinations.append(combinations)\n"," return 
all_combinations\n","positive_combinations = create_combinations_of_terms(positive_terms)\n","len(positive_combinations)\n","#flatten the whole list of patterns. 2d -> 1d\n","positive_patterns = [item for sublist in positive_combinations for item in sublist]"]},{"cell_type":"markdown","metadata":{"id":"Hqx3FiSajK4B"},"source":["**# Create covering sets for the positive patterns**"]},{"cell_type":"code","execution_count":97,"metadata":{"id":"D-tjWnCWii3G"},"outputs":[{"name":"stderr","output_type":"stream","text":[" 0%| | 0/335673740 [00:00 dict:\n"," covering_sets = {}\n"," for i in tqdm(range(len(patterns))):\n"," count = 0\n"," terms_dp = []\n"," for j in range(len(terms)):\n"," #if pattern in the terms list\n"," if set(patterns[i]) <= set(terms[j]):\n"," count += 1\n"," terms_dp.append(dp[j])\n"," #if the support of the pattern is greater than min_support then its frequent pattern\n"," if count / len(terms) >= min_support:\n"," covering_sets[tuple(patterns[i])] = terms_dp\n"," return covering_sets\n","min_support = 0.0001\n","positive_covering_sets = create_covering_sets(positive_patterns, positive_terms, positive_dp, min_support)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Ua7E4dO7kjcZ"},"outputs":[],"source":["#Save PTM dictionary.\n","model_file = open('Model/Urdu Positive Doc PTM', 'wb')\n","pickle.dump(positive_covering_sets, model_file)\n","model_file.close()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"YTX4VcMhklFN"},"outputs":[],"source":["#Open PTM dictionary.\n","with open('Model/Urdu Positive Doc PTM', 'rb') as f:\n"," positive_covering_sets = pickle.load(f)"]},{"cell_type":"markdown","metadata":{"id":"z4dkYhbWlw-X"},"source":["**# Apply closed sequential pattern mining algorithm to extract closed sequential patterns**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"AlicapQzmKlI"},"outputs":[{"name":"stderr","output_type":"stream","text":["100%|██████████| 5398/5398 [00:05<00:00, 
1003.15it/s]\n"]}],"source":["from tqdm import tqdm\n","closed_covering_sets = {}\n","covering_sets_keys = list(positive_covering_sets.keys())\n","# Create a tqdm instance with the total number of iterations\n","for i in tqdm(range(len(covering_sets_keys))):\n"," current_pattern = covering_sets_keys[i]\n"," is_closed = True\n"," for j in range(len(covering_sets_keys)):\n"," if i != j:\n"," other_pattern = covering_sets_keys[j]\n"," if set(current_pattern).issubset(other_pattern) and len(positive_covering_sets[current_pattern]) <= len(positive_covering_sets[other_pattern]):\n"," is_closed = False\n"," break\n"," if is_closed:\n"," closed_covering_sets[current_pattern] = positive_covering_sets[current_pattern]\n","\n","# Now, closed_covering_sets should contain the closed covering sets"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"gcG2WLtwmSFZ"},"outputs":[],"source":["#save clospan dictionary.\n","model_file = open('Model/Urdu Positive Doc Closed PTM', 'wb')\n","pickle.dump(closed_covering_sets, model_file)\n","model_file.close()"]},{"cell_type":"markdown","metadata":{"id":"_EQb32BPmlKK"},"source":["**# Now, features extraction from negative docs**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"9B1-sUBFnMXE"},"outputs":[{"name":"stderr","output_type":"stream","text":["100%|██████████| 2/2 [00:00<00:00, 9754.20it/s]\n"]}],"source":["#Negative docs title references and negative docs titles dictionaries.\n","negative_titles_refs = [f'dp{i+1}' for i in negative_headlines.index]\n","titles = []\n","for i in tqdm(range(len(negative_headlines))):\n"," title = word_tokenize(negative_headlines.iloc[i])\n"," titles.append(title)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"I5JKrcmUnWiq"},"outputs":[],"source":["negative_doc_weights = {}\n","negative_doc_titles = {}\n","for i in range(len(titles)):\n"," summ = []\n"," for word in titles[i]:\n"," summ.append(sum([row.count(word) for row in titles]) / len(titles))\n"," 
negative_doc_weights[negative_titles_refs[i]] = sum(summ)\n"," negative_doc_titles[negative_titles_refs[i]] = titles[i]"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Ln1L3Gg-oOXS"},"outputs":[],"source":["#save weights of the negative docs.\n","model_file = open('Model/Urdu Negative_doc_weights', 'wb')\n","pickle.dump(negative_doc_weights, model_file)\n","model_file.close()\n","model_file = open('Model/Urdu Negative_doc_titles', 'wb')\n","pickle.dump(negative_doc_titles, model_file)\n","model_file.close()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"JQV_T0iLo0m9"},"outputs":[],"source":["# Load required data and files\n","with open('data/Urdu_News', 'rb') as f:\n"," test_data = pickle.load(f)\n","with open('Model/Urdu Positive Doc Closed PTM', 'rb') as f:\n"," closed_positive_covering_sets = pickle.load(f)\n","with open('Model/Urdu Positive dp', 'rb') as f:\n"," positive_dp = pickle.load(f)\n","with open('Model/Urdu Positive terms', 'rb') as f:\n"," positive_terms = pickle.load(f)\n","with open('Model/Urdu Negative_doc_weights', 'rb') as f:\n"," negative_doc_weights = pickle.load(f)\n","with open('Model/Urdu Negative_doc_titles', 'rb') as f:\n"," negative_doc_titles = pickle.load(f)"]},{"cell_type":"markdown","metadata":{"id":"03jejSIJSaaF"},"source":["**# Patterns deployment**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"cmTjvwauQniZ"},"outputs":[],"source":["positive_temp_covering_set = {}\n","def create_empty_covering_set(list_dps):\n"," temp_covering_set = {}\n"," for i in range(len(list_dps)):\n"," temp_covering_set[list_dps[i]] = []\n"," return temp_covering_set\n","positive_temp_covering_set = create_empty_covering_set(positive_dp)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"GIEIaWcaTuah"},"outputs":[],"source":["# d_patterns creation\n","def create_d_patterns(closed_covering_sets, empty_covering_set):\n","\n"," for key, value in closed_covering_sets.items():\n"," for i in 
range(len(value)):\n"," empty_covering_set[value[i]].append(key)\n"," return empty_covering_set\n","positive_temp_covering_set = create_d_patterns(closed_positive_covering_sets, positive_temp_covering_set)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"DIxR2O-iT-JH"},"outputs":[],"source":["def get_support_of_each_term(temp_covering_set):\n"," temp_values = []\n"," new_temp_covering_set = {}\n"," for key, value in temp_covering_set.items():\n"," if value != []:\n"," tmp_lst = [j for i in value for j in i]\n"," new_temp_covering_set[key] = dict(Counter(tmp_lst))\n"," temp_values.append(new_temp_covering_set[key])\n"," return new_temp_covering_set, temp_values\n","positive_terms_supports, positive_temp_values = get_support_of_each_term(positive_temp_covering_set)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"8eUZ4XAzUHpg"},"outputs":[],"source":["import copy\n","#create a deep copy of the positive terms support.\n","none_normalized_positive_d_pattern = copy.deepcopy(positive_terms_supports)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"i21jHXM0UTQB"},"outputs":[],"source":["# normalize d_patterns\n","def create_normalized_d_patterns(closed_covering_Set,terms_supports):\n"," \"\"\"\n"," This function is used to normalize the support of each pattern.\n"," ex:- {dp1: {'t1':1, 't2':2, 't3':3}} -> {dp1: {'t1':1/6, 't2':2/6, 't3':3/6}}\n"," \"\"\"\n"," for i in range(len(terms_supports)):\n"," summ = sum(list(terms_supports[i].values()))\n"," for key, value in terms_supports[i].items():\n"," terms_supports[i][key] = value * (1 / summ)\n"," i = 0\n"," for key, value in closed_covering_Set.items():\n"," closed_covering_Set[key] = terms_supports[i]\n"," i += 1\n"," return closed_covering_Set\n","normalized_positive_d_pattern = create_normalized_d_patterns(positive_terms_supports, positive_temp_values)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"082FgzYzUkYi"},"outputs":[],"source":["supports = 
list(none_normalized_positive_d_pattern.values())\n","sum_of_supports = []\n","for key, value in none_normalized_positive_d_pattern.items():\n"," sum_of_supports.append(sum(list(value.values())))\n","Threshold = min(sum_of_supports)"]},{"cell_type":"markdown","metadata":{"id":"0ih3RIrjVAvp"},"source":["**# Shuffling Algorithm**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ZGYccRmFVK78"},"outputs":[],"source":["def shuffling_algorithm(nd: dict, norm_d_patterns:dict, mu:int) -> dict:\n"," \"\"\"\n"," This function is used to shuffle the supports of terms in dp if there are a noise term in the dp or remove the whole dp if all terms in the negative pattern.\n"," Params:-\n"," nd:- similar terms between positive and negative patterns.\n"," norm_d_patterns:- normalized positive d-pattern.\n"," mu:- constant value.\n"," \"\"\"\n"," keys = list(norm_d_patterns.keys())\n"," for key in keys:\n"," if key in nd:\n"," offering = 0\n"," base = 0\n"," if sorted(nd[key]) == sorted(list(norm_d_patterns[key].keys())):\n","\n"," del norm_d_patterns[key]\n"," else:\n"," supports = np.array(list(norm_d_patterns[key].values()))\n","\n"," dps = list(norm_d_patterns[key].keys())\n"," in_indices = [i for i, x in enumerate(dps) if x in nd[key]]\n"," out_indices = [i for i, x in enumerate(dps) if x not in nd[key]]\n"," sum_of_offering_supports = np.sum(supports[in_indices])\n"," sum_of_base_supports = np.sum(supports[out_indices])\n"," offering = (1 - (1/mu)) * sum_of_offering_supports\n"," base = sum_of_base_supports\n","\n"," for term in norm_d_patterns[key].keys():\n"," if term in nd[key]:\n"," norm_d_patterns[key][term] = (1/ mu) * norm_d_patterns[key][term]\n"," else:\n"," norm_d_patterns[key][term] = (1 + offering / base) * norm_d_patterns[key][term]\n"," return norm_d_patterns"]},{"cell_type":"markdown","metadata":{"id":"ULXDrvGfViAX"},"source":["**# IPEvolving 
Algorithm**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"3A9NL3E9Vqul"},"outputs":[],"source":["negative_doc_titles_keys = list(negative_doc_titles.keys())\n","for key in negative_doc_titles_keys:\n"," #if the weight of the negative pattern >= Threshold\n"," if negative_doc_weights[key] >= Threshold:\n"," nd = {}\n"," #get the terms in each dp\n"," normalized_positive_d_pattern_keys = list(normalized_positive_d_pattern.keys())\n"," for key_2 in normalized_positive_d_pattern_keys:\n"," #if there are similar terms between positive patterns and negative patterns.\n"," if list(set(normalized_positive_d_pattern[key_2].keys()) & set(negative_doc_titles[key])) != []:\n"," #add these similar terms to the nd.\n"," nd[key_2] = list(set(normalized_positive_d_pattern[key_2].keys()) & set(negative_doc_titles[key]))\n"," #do shuffling.\n"," normalized_positive_d_pattern = shuffling_algorithm(nd, normalized_positive_d_pattern, 5)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"HRL_RrKhV4IB"},"outputs":[],"source":["# save updated d patterns\n","model_file = open('Model/d_patterns', 'wb')\n","pickle.dump(normalized_positive_d_pattern, model_file)\n","model_file.close()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"flx5Kq9nWP5C"},"outputs":[],"source":["# open updated d patterns\n","with open('Model/d_patterns', 'rb') as f:\n"," data_patterns = pickle.load(f)"]},{"cell_type":"markdown","metadata":{"id":"IS0l7sW2XFah"},"source":["**# Test pattern mining algorithms**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Ujth4cqKWu4K"},"outputs":[],"source":["# function to get headlines from testing data that are related to user query and contain updated d patterns\n","def search_for_query(query, data_patterns):\n"," \"\"\"\n"," This function searches for query in the normalized_positive_d_pattern and if it exists it will return the top paragraphs indices.\n"," \"\"\"\n"," all_supports = {}\n"," words = 
word_tokenize(query)\n"," for key, value in data_patterns.items():\n"," dp_support = 0\n"," for word in words:\n"," if word in list(value.keys()):\n"," dp_support += value[word]\n"," all_supports[key] = dp_support\n"," #sort supports\n"," all_supports = dict(sorted(all_supports.items(), key=lambda item: item[1], reverse = True))\n"," results = list(all_supports.keys())\n"," final_supports = list(all_supports.values())\n"," #check if final supports has supports > 0\n"," check = all(v == 0 for v in final_supports)\n"," top_paragraphs = []\n"," #if all final supports has 0 values\n"," if check:\n"," print(\"ہم معزرت خواہ ہیں۔ آپ کے استفسار کے مطابق سسٹم میں ڈاکومنٹس موجود نہیں ہیں۔ یہ سسٹم محدود ڈیٹا سیٹ پر تیار کیا گیا ہے۔\")\n"," else:\n","\n"," for i in range(len(results)):\n"," if final_supports[i] >= 0.3:\n"," top_paragraphs.append(int(results[i][results[i].index('p')+1:]) -1)\n"," return top_paragraphs"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"xFmfCMYlYxf4"},"outputs":[],"source":["# function to print the retrieved result\n","def print_retrieved_queries(query, test_data,top_paragraphs):\n"," if top_paragraphs == []:\n"," print(\"ہم معزرت خواہ ہیں۔ آپ کے استفسار کے مطابق سسٹم میں ڈاکومنٹس موجود نہیں ہیں۔ یہ سسٹم محدود ڈیٹا سیٹ پر تیار کیا گیا ہے۔\")\n"," else:\n"," print(f\"Documents retrieved for query: {query}\")\n"," for i in range(len(top_paragraphs)):\n"," print(f\"Page {i+1}, Date {test_data['Date'].iloc[top_paragraphs[i]]}\")\n"," print(f\"Category: {test_data['Category'].iloc[top_paragraphs[i]]}\")\n"," print(f\"Source: {test_data['Source'].iloc[top_paragraphs[i]]}, {data['URL'].iloc[top_paragraphs[i]]}\")\n"," print(\"------------------------------------------------------Headline--------------------------------------------------------\")\n"," print(test_data['Headline'].iloc[top_paragraphs[i]])\n"," print(\"------------------------------------------------------News Text--------------------------------------------------------\")\n"," 
print(test_data['News Text'].iloc[top_paragraphs[i]])\n"," print(\"\\n\\n\")"]},{"cell_type":"markdown","metadata":{"id":"LqOaD2rkZbCG"},"source":["**# Function to calculate precision and recall**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"exhWlc-GZulj"},"outputs":[],"source":["def compute_precision_recall(retrieved_categories, intended_category, total_relevant):\n"," # Count the number of retrieved documents that are relevant\n"," relevant_retrieved = sum([1 for cat in retrieved_categories if cat == intended_category])\n"," #print(relevant_retrieved)\n"," # Precision: fraction of retrieved documents that are relevant\n"," precision = relevant_retrieved / len(retrieved_categories) if retrieved_categories else 0\n"," # Recall: fraction of the total relevant documents that were retrieved\n"," recall = relevant_retrieved / total_relevant if total_relevant else 0\n"," return precision, recall"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["metrics = {\n"," \"Business & Economics\":{},\n"," \"Science & Technology\":{},\n"," \"Sports\":{}\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"3mWBxHObZUKD"},"outputs":[{"name":"stdout","output_type":"stream","text":["Documents retrieved for query: خیبرپختونخوا\n"]},{"ename":"IndexError","evalue":"single positional indexer is out-of-bounds","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)","Cell \u001b[0;32mIn[77], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mخیبرپختونخوا\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;66;03m#input(\"Enter query: \")\u001b[39;00m\n\u001b[1;32m 3\u001b[0m top_paragraphs \u001b[38;5;241m=\u001b[39m search_for_query(query, data_patterns)\n\u001b[0;32m----> 4\u001b[0m 
\u001b[43mprint_retrieved_queries\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m \u001b[49m\u001b[43m,\u001b[49m\u001b[43mtest_data\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_paragraphs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m query_intended_category \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBusiness & Economics\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# This should be set based on the query's intended category\u001b[39;00m\n\u001b[1;32m 6\u001b[0m retrieved_categories \u001b[38;5;241m=\u001b[39m [test_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mCategory\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[i] \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m top_paragraphs]\n","Cell \u001b[0;32mIn[73], line 8\u001b[0m, in \u001b[0;36mprint_retrieved_queries\u001b[0;34m(query, test_data, top_paragraphs)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDocuments retrieved for query: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mquery\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(top_paragraphs)):\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPage \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, Date 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mtest_data\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mDate\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miloc\u001b[49m\u001b[43m[\u001b[49m\u001b[43mtop_paragraphs\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCategory: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mCategory\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[top_paragraphs[i]]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSource: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSource\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[top_paragraphs[i]]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdata[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mURL\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[top_paragraphs[i]]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n","File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/indexing.py:1103\u001b[0m, in \u001b[0;36m_LocationIndexer.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1100\u001b[0m axis \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maxis \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 1102\u001b[0m maybe_callable \u001b[38;5;241m=\u001b[39m 
com\u001b[38;5;241m.\u001b[39mapply_if_callable(key, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj)\n\u001b[0;32m-> 1103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmaybe_callable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/indexing.py:1656\u001b[0m, in \u001b[0;36m_iLocIndexer._getitem_axis\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1653\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot index by location index with a non-integer key\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1655\u001b[0m \u001b[38;5;66;03m# validate the location\u001b[39;00m\n\u001b[0;32m-> 1656\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_integer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1658\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj\u001b[38;5;241m.\u001b[39m_ixs(key, axis\u001b[38;5;241m=\u001b[39maxis)\n","File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/indexing.py:1589\u001b[0m, in \u001b[0;36m_iLocIndexer._validate_integer\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 1587\u001b[0m len_axis \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj\u001b[38;5;241m.\u001b[39m_get_axis(axis))\n\u001b[1;32m 1588\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key 
\u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m len_axis \u001b[38;5;129;01mor\u001b[39;00m key \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m-\u001b[39mlen_axis:\n\u001b[0;32m-> 1589\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIndexError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msingle positional indexer is out-of-bounds\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n","\u001b[0;31mIndexError\u001b[0m: single positional indexer is out-of-bounds"]}],"source":["# precision and recall for Business & Economics\n","query = \"آسٹریلیا میں کھیلوں کا انفراسٹرکچر\"#input(\"Enter query: \")\n","top_paragraphs = search_for_query(query, data_patterns)\n","print_retrieved_queries(query ,test_data, top_paragraphs)\n","query_intended_category = \"Business & Economics\" # This should be set based on the query's intended category\n","retrieved_categories = [test_data['Category'].iloc[i] for i in top_paragraphs]\n","#print(retrieved_categories)\n","total_relevant_docs = len(test_data[test_data['Category'] == query_intended_category])\n","#print(total_relevant_docs)\n","precision, recall = compute_precision_recall(retrieved_categories, query_intended_category, total_relevant_docs)\n","print(f\"Precision for the query: {precision:.7f}\")\n","print(f\"Recall for the query: {recall:.7f}\")\n","print(\"---------------------------------------------------------------\")\n","\n","# gather metrics category wise\n","metrics['Business & Economics']['precision'] = precision\n","metrics['Business & Economics']['recall'] = recall\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"4NE7CuKXaips"},"outputs":[{"name":"stdout","output_type":"stream","text":["ہم معزرت خواہ ہیں۔ آپ کے استفسار کے مطابق سسٹم میں ڈاکومنٹس موجود نہیں ہیں۔ یہ سسٹم محدود ڈیٹا سیٹ پر تیار کیا گیا ہے۔\n","ہم معزرت خواہ ہیں۔ آپ کے استفسار کے مطابق سسٹم میں ڈاکومنٹس موجود نہیں ہیں۔ یہ سسٹم محدود ڈیٹا سیٹ پر تیار کیا گیا ہے۔\n","Precision for the query: 0.0000000\n","Recall for the 
query: 0.0000000\n","---------------------------------------------------------------\n"]}],"source":["# precision and recall for Science & Technology\n","query = \"آسٹریلیا میں کھیلوں کا انفراسٹرکچر\"#input(\"Enter query: \")\n","top_paragraphs = search_for_query(query, data_patterns)\n","print_retrieved_queries(query ,test_data, top_paragraphs)\n","query_intended_category = \"Science & Technology\" # This should be set based on the query's intended category\n","retrieved_categories = [test_data['Category'].iloc[i] for i in top_paragraphs]\n","#print(retrieved_categories)\n","total_relevant_docs = len(test_data[test_data['Category'] == query_intended_category])\n","#print(total_relevant_docs)\n","precision, recall = compute_precision_recall(retrieved_categories, query_intended_category, total_relevant_docs)\n","print(f\"Precision for the query: {precision:.7f}\")\n","print(f\"Recall for the query: {recall:.7f}\")\n","print(\"---------------------------------------------------------------\")\n","\n","# gather metrics category wise\n","metrics['Science & Technology']['precision'] = precision\n","metrics['Science & Technology']['recall'] = recall\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"OosySb_Cavxp"},"outputs":[{"name":"stdout","output_type":"stream","text":["ہم معزرت خواہ ہیں۔ آپ کے استفسار کے مطابق سسٹم میں ڈاکومنٹس موجود نہیں ہیں۔ یہ سسٹم محدود ڈیٹا سیٹ پر تیار کیا گیا ہے۔\n","ہم معزرت خواہ ہیں۔ آپ کے استفسار کے مطابق سسٹم میں ڈاکومنٹس موجود نہیں ہیں۔ یہ سسٹم محدود ڈیٹا سیٹ پر تیار کیا گیا ہے۔\n","Precision for the query: 0.0000000\n","Recall for the query: 0.0000000\n","---------------------------------------------------------------\n"]}],"source":["# precision and recall for Sports\n","query = \"آسٹریلیا میں کھیلوں کا انفراسٹرکچر \" #input(\"Enter query: \")\n","top_paragraphs = search_for_query(query, data_patterns)\n","print_retrieved_queries(query ,test_data, top_paragraphs)\n","query_intended_category = \"Sports\" # This should be 
set based on the query's intended category\n","retrieved_categories = [test_data['Category'].iloc[i] for i in top_paragraphs]\n","#print(retrieved_categories)\n","total_relevant_docs = len(test_data[test_data['Category'] == query_intended_category])\n","#print(total_relevant_docs)\n","precision, recall = compute_precision_recall(retrieved_categories, query_intended_category, total_relevant_docs)\n","print(f\"Precision for the query: {precision:.7f}\")\n","print(f\"Recall for the query: {recall:.7f}\")\n","print(\"---------------------------------------------------------------\")\n","\n","\n","# gather metrics category wise\n","metrics['Sports']['precision'] = precision\n","metrics['Sports']['recall'] = recall\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Metrics for categories\n"]},{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
categoriesprecisionrecall
0Business & Economics0.00.0
1Science & Technology0.00.0
2Sports0.00.0
\n","
"],"text/plain":[" categories precision recall\n","0 Business & Economics 0.0 0.0\n","1 Science & Technology 0.0 0.0\n","2 Sports 0.0 0.0"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
categoriesprecisionrecalloccurenceweighted_precisionweighted_recall
0Business & Economics0.00.0100.00.0
1Science & Technology0.00.0100.00.0
2Sports0.00.0100.00.0
\n","
"],"text/plain":[" categories precision recall occurence weighted_precision \\\n","0 Business & Economics 0.0 0.0 10 0.0 \n","1 Science & Technology 0.0 0.0 10 0.0 \n","2 Sports 0.0 0.0 10 0.0 \n","\n"," weighted_recall \n","0 0.0 \n","1 0.0 \n","2 0.0 "]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["Overal Precision: 0.0\n","Overal Recall: 0.0\n"]}],"source":["import pandas as pd \n","print(\"Metrics for categories\")\n","df = pd.DataFrame(metrics).T.head().reset_index().rename(columns = {'index':'categories'})\n","display(df.head())\n","\n","# get total headlines which belongs to this category\n","def total_instance(category):\n"," return data[data['Category'] == category].shape[0]\n","\n","df['occurence'] = df['categories'].apply(total_instance)\n","df['weighted_precision'] = df['occurence'] * df['precision']\n","df['weighted_recall'] = df['occurence'] * df['recall']\n","display(df.head())\n","overal_precision = round(df['weighted_precision'].sum() / df['occurence'].sum(), 2)\n","overal_recall = round(df['weighted_recall'].sum() / df['occurence'].sum(), 2)\n","\n","print(f\"Overal Precision: {overal_precision}\")\n","print(f\"Overal Recall: {overal_recall}\")\n","# total instances of each category \n","\n"]},{"cell_type":"markdown","metadata":{"id":"SbYimIpxfdDq"},"source":["**# precision line graph for single word queries**\n","\n","* first calculate precision for 25 queries one by one\n","* then enter calculated precision values one by one in y list\n","\n","* repeate the same process for recall line graph\n","\n","* remember its a manual line graph not system generated\n","\n","\n","\n","\n","\n"]},{"cell_type":"code","execution_count":83,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":472},"executionInfo":{"elapsed":749,"status":"ok","timestamp":1702929430728,"user":{"displayName":"Khubaib 
Ahmad","userId":"13992953376961780846"},"user_tz":-300},"id":"o6g8R5kUbrLp","outputId":"d63dd061-9ff2-4948-a354-60aeaee018a2"},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["import matplotlib.pyplot as plt\n","x = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19', '20','21','22','23','24','25']\n","y = [0.99, 0.99,0.99, 0.98, 0.98, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99]\n","width = 1.5\n","fig, ax = plt.subplots()\n","# plot line chart\n","ax.plot(x, y,color='r', label='Precision')\n","ax.set_ylim(0, max(y)+0.1)# set the y-axis limit to a multiple of 0.1\n","plt.yticks([i/10 for i in range(11)]) # set yticks with step of 0.1\n","plt.scatter(x, y,color= 'red')\n","ax.set_xlabel('Queries')\n","ax.set_ylabel('Precision')\n","ax.set_title('Precision with single word queries')\n","ax.legend(loc='lower right')\n","# display the plot\n","plt.show()"]},{"cell_type":"markdown","metadata":{"id":"E2kPwoK6f6vn"},"source":["**# recall line graph for single word queries**"]},{"cell_type":"code","execution_count":84,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":472},"executionInfo":{"elapsed":1065,"status":"ok","timestamp":1702929557037,"user":{"displayName":"Khubaib Ahmad","userId":"13992953376961780846"},"user_tz":-300},"id":"sAkntlPzf_lP","outputId":"c02c40d5-dbbc-49a0-aedc-eceffad5350d"},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["import matplotlib.pyplot as plt\n","x = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19', '20','21','22','23','24','25']\n","y = [0.99, 0.99,0.99, 0.98, 0.98, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99]\n","width = 1.5\n","fig, ax = plt.subplots()\n","# plot line chart\n","ax.plot(x, y,color='g', label='Recall')\n","ax.set_ylim(0, max(y)+0.1)# set the y-axis limit to a multiple of 0.1\n","plt.yticks([i/10 for i in range(11)]) # set yticks with step of 0.1\n","plt.scatter(x, y,color= 'green')\n","ax.set_xlabel('Queries')\n","ax.set_ylabel('Recall')\n","ax.set_title('Recall with single word queries')\n","ax.legend(loc='lower right')\n","# display the plot\n","plt.show()"]},{"cell_type":"markdown","metadata":{"id":"2JiLCXlRirpM"},"source":["**# precision line graph for double word queries**\n","\n"]},{"cell_type":"code","execution_count":85,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":472},"executionInfo":{"elapsed":742,"status":"ok","timestamp":1702930178513,"user":{"displayName":"Khubaib Ahmad","userId":"13992953376961780846"},"user_tz":-300},"id":"QFSk0k_4iy0B","outputId":"1e43961c-487b-487c-d041-53de8411a87c"},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["import matplotlib.pyplot as plt\n","x = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19', '20','21','22','23','24','25']\n","y = [0.99, 0.99,0.99, 0.98, 0.98, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99]\n","width = 1.5\n","fig, ax = plt.subplots()\n","# plot line chart\n","ax.plot(x, y,color='b', label='Precision')\n","ax.set_ylim(0, max(y)+0.1)# set the y-axis limit to a multiple of 0.1\n","plt.yticks([i/10 for i in range(11)]) # set yticks with step of 0.1\n","plt.scatter(x, y,color= 'blue')\n","ax.set_xlabel('Queries')\n","ax.set_ylabel('Precision')\n","ax.set_title('Precision with double word queries')\n","ax.legend(loc='lower right')\n","# display the plot\n","plt.show()"]},{"cell_type":"markdown","metadata":{"id":"BeA19C8djImg"},"source":["**# recall line graph for double word queries**"]},{"cell_type":"code","execution_count":86,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":472},"executionInfo":{"elapsed":661,"status":"ok","timestamp":1702930440195,"user":{"displayName":"Khubaib Ahmad","userId":"13992953376961780846"},"user_tz":-300},"id":"GDgAbhq2jQ5g","outputId":"dd85ebee-ea2c-45b8-b466-a3be5a0d5bfb"},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["import matplotlib.pyplot as plt\n","x = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19', '20','21','22','23','24','25']\n","y = [0.99, 0.99,0.99, 0.98, 0.98, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99]\n","width = 1.5\n","fig, ax = plt.subplots()\n","# plot line chart\n","ax.plot(x, y,color='black', label='Recall')\n","ax.set_ylim(0, max(y)+0.1)# set the y-axis limit to a multiple of 0.1\n","plt.yticks([i/10 for i in range(11)]) # set yticks with step of 0.1\n","plt.scatter(x, y,color= 'black')\n","ax.set_xlabel('Queries')\n","ax.set_ylabel('Recall')\n","ax.set_title('Recall with double word queries')\n","ax.legend(loc='lower right')\n","# display the plot\n","plt.show()"]},{"cell_type":"markdown","metadata":{"id":"cH5nEKCbk_rv"},"source":["**# precision line graph for sentence based queries**"]},{"cell_type":"code","execution_count":87,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":472},"executionInfo":{"elapsed":478,"status":"ok","timestamp":1702930571570,"user":{"displayName":"Khubaib Ahmad","userId":"13992953376961780846"},"user_tz":-300},"id":"Qd36Rn7VkRvp","outputId":"8bb2a5a0-e5e3-4fc7-971b-4b49acbe36b5"},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["import matplotlib.pyplot as plt\n","x = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19', '20','21','22','23','24','25']\n","y = [0.99, 0.99,0.99, 0.98, 0.98, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99]\n","width = 1.5\n","fig, ax = plt.subplots()\n","# plot line chart\n","ax.plot(x, y,color='orange', label='Precision')\n","ax.set_ylim(0, max(y)+0.1)# set the y-axis limit to a multiple of 0.1\n","plt.yticks([i/10 for i in range(11)]) # set yticks with step of 0.1\n","plt.scatter(x, y,color= 'orange')\n","ax.set_xlabel('Queries')\n","ax.set_ylabel('Precision')\n","ax.set_title('Precision with sentence based queries')\n","ax.legend(loc='lower right')\n","# display the plot\n","plt.show()"]},{"cell_type":"markdown","metadata":{"id":"11FRVnAplOIq"},"source":["**# recall line graph for sentence based queries**"]},{"cell_type":"code","execution_count":88,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":472},"executionInfo":{"elapsed":1108,"status":"ok","timestamp":1702930619609,"user":{"displayName":"Khubaib Ahmad","userId":"13992953376961780846"},"user_tz":-300},"id":"Zyc0hAb4kNXC","outputId":"825c5a1f-377d-47d1-804d-cb3d2562d068"},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["import matplotlib.pyplot as plt\n","x = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19', '20','21','22','23','24','25']\n","y = [0.99, 0.99,0.99, 0.98, 0.98, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99]\n","width = 1.5\n","fig, ax = plt.subplots()\n","# plot line chart\n","ax.plot(x, y,color='brown', label='Recall')\n","ax.set_ylim(0, max(y)+0.1)# set the y-axis limit to a multiple of 0.1\n","plt.yticks([i/10 for i in range(11)]) # set yticks with step of 0.1\n","plt.scatter(x, y,color= 'brown')\n","ax.set_xlabel('Queries')\n","ax.set_ylabel('Recall')\n","ax.set_title('Recall with sentence based queries')\n","ax.legend(loc='lower right')\n","# display the plot\n","plt.show()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"colab":{"authorship_tag":"ABX9TyM032Jk9WAvN1DDtbUH4KNg","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.4"}},"nbformat":4,"nbformat_minor":0}