3 changes: 3 additions & 0 deletions Contact_Me.txt
@@ -0,0 +1,3 @@
Name: Rishabh Agnihotri
email id: [email protected]
mobile no: 9198468894
106 changes: 106 additions & 0 deletions code.ipynb
@@ -0,0 +1,106 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import re\n",
"from transformers import pipeline\n",
"import wikipediaapi\n",
"\n",
"# Load the JSON data with UTF-8 encoding\n",
"with open('C:\\\\Users\\\\hp\\\\Desktop\\\\rishu_assignment\\\\news.article.json', 'r', encoding='utf-8') as file:\n",
" data = json.load(file)\n",
"\n",
"def preprocess_text(text):\n",
" # Remove special characters and extra spaces\n",
" text = re.sub(r'\\s+', ' ', text)\n",
" text = re.sub(r'[^\\w\\s]', '', text)\n",
" return text\n",
"\n",
"# Preprocess each article's text\n",
"for article in data:\n",
" article['cleaned_text'] = preprocess_text(article['articleBody'])\n",
"\n",
"# Filter relevant articles based on specific keywords\n",
"def is_relevant_article(article):\n",
" specific_keywords = [\"Al-Shifa Hospital\", \"Gaza hospital\", \"Israel airstrike\", \"Hamas attack\", \"Gaza conflict\"]\n",
" return any(keyword.lower() in article['cleaned_text'].lower() for keyword in specific_keywords)\n",
"\n",
"relevant_articles = [article for article in data if is_relevant_article(article)]\n",
"\n",
"# Load a pre-trained QA model\n",
"qa_pipeline = pipeline(\"question-answering\", model=\"distilbert-base-uncased-distilled-squad\")\n",
"\n",
"# Function to get additional context from Wikipedia with a specified user-agent\n",
"def get_wikipedia_context(topic):\n",
" user_agent = 'MyUserAgent/1.0 ([email protected])' # Replace with your own details\n",
" wiki_wiki = wikipediaapi.Wikipedia('en', headers={'User-Agent': user_agent})\n",
" page = wiki_wiki.page(topic)\n",
" if page.exists():\n",
" return page.summary\n",
" else:\n",
" return \"\"\n",
"\n",
"# Function to get combined context\n",
"def get_combined_context(relevant_articles, wiki_topic, max_length=5000):\n",
" context = \"\"\n",
" for article in relevant_articles:\n",
" if len(context) + len(article['cleaned_text']) > max_length:\n",
" break\n",
" context += article['cleaned_text'] + \" \"\n",
" \n",
" wiki_context = get_wikipedia_context(wiki_topic)\n",
" combined_context = context + \" \" + wiki_context\n",
" \n",
" # Debugging: Print combined context\n",
" print(\"Combined Context: \", combined_context[:1000]) # Print first 1000 characters to check content\n",
" \n",
" return combined_context\n",
"\n",
"# Function to get answer to a question\n",
"def get_answer(question, relevant_articles, wiki_topic):\n",
" combined_context = get_combined_context(relevant_articles, wiki_topic)\n",
" result = qa_pipeline(question=question, context=combined_context)\n",
" return result['answer']\n",
"\n",
"# Example question\n",
"question = \"What happened at the Al-Shifa Hospital?\"\n",
"wiki_topic = \"Al-Shifa Hospital Gaza conflict\"\n",
"answer = get_answer(question, relevant_articles, wiki_topic)\n",
"print(\"Answer: \", answer)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
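A note on the cell above: distilbert-base-uncased-distilled-squad can only attend to 512 tokens at a time, while the combined context built here can run to several thousand characters. The Hugging Face question-answering pipeline copes by splitting long contexts into overlapping windows; the sketch below makes that windowing explicit, assuming the cell above has already been executed (the max_seq_len and doc_stride values are illustrative choices, not taken from the notebook).

# Sketch: pass explicit windowing parameters so the long combined context
# is split into overlapping chunks that fit the model's input size.
result = qa_pipeline(
    question=question,
    context=get_combined_context(relevant_articles, wiki_topic),
    max_seq_len=384,   # tokens per window (the model's hard limit is 512)
    doc_stride=128,    # token overlap between consecutive windows
)
print(result["answer"], result["score"])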
19 changes: 19 additions & 0 deletions explanation.txt
@@ -0,0 +1,19 @@
I have completed the first part of Q1. Since the instructions state that only one of the two parts needs to be attempted, I did the first one.
Below is a step-by-step explanation:

1. Data Loading and Preprocessing
First, we load the JSON file containing the news articles. Since the raw text may contain special characters and extra spaces,
we clean it up (a small worked example of this cleaning step follows after step 6).
2. Filtering Relevant Articles
To ensure our QA system focuses on the Israel-Hamas war, we filter out irrelevant articles. We define a list of keywords related
to the conflict and keep only those articles that contain these keywords.
3. Load a Pre-trained QA Model
We use a pre-trained QA model from Hugging Face's Transformers library. In this case, we choose the distilbert-base-uncased-distilled-squad
model for its efficiency and accuracy in question answering tasks.
4. Augment Context with Wikipedia
To enhance the context available for the QA system, we fetch additional information from Wikipedia. This step helps provide a more
comprehensive background for the QA model.
5. Combine Context
We combine the filtered articles with the Wikipedia context. This combined context is then used by the QA model to answer questions.
6. Answer Questions
Finally, we create a function that uses the combined context to answer a given question.
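As mentioned in step 1, here is a small worked example of the cleaning function on a made-up string; it also shows why the keyword list used in step 2 should be cleaned the same way, since punctuation such as the hyphen in "Al-Shifa" is stripped from the article text.

import re

def preprocess_text(text):
    # Collapse runs of whitespace, then drop everything that is not a word character or a space
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

print(preprocess_text("Al-Shifa  Hospital,\nGaza."))   # -> AlShifa Hospital Gaza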
4 changes: 4 additions & 0 deletions requirements.txt
@@ -0,0 +1,4 @@
# json and re are Python standard-library modules and need no installation
transformers
torch
wikipedia-api
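Two small notes on this list: the wikipediaapi module imported in code.ipynb is distributed on PyPI as wikipedia-api, and the transformers question-answering pipeline also needs a deep-learning backend (PyTorch is assumed here). A quick sanity check after installation:

import transformers, torch, wikipediaapi
print(transformers.__version__, torch.__version__)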