diff --git a/Contact_Me.txt b/Contact_Me.txt
new file mode 100644
index 0000000..1275e4b
--- /dev/null
+++ b/Contact_Me.txt
@@ -0,0 +1,3 @@
+Name: Rishabh Agnihotri
+Email: rishabhagnihotri272@gmail.com
+Mobile: 9198468894
\ No newline at end of file
diff --git a/code.ipynb b/code.ipynb
new file mode 100644
index 0000000..11d9888
--- /dev/null
+++ b/code.ipynb
@@ -0,0 +1,106 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import re\n",
+    "from transformers import pipeline\n",
+    "import wikipediaapi\n",
+    "\n",
+    "# Load the JSON data with UTF-8 encoding\n",
+    "with open('C:\\\\Users\\\\hp\\\\Desktop\\\\rishu_assignment\\\\news.article.json', 'r', encoding='utf-8') as file:\n",
+    "    data = json.load(file)\n",
+    "\n",
+    "def preprocess_text(text):\n",
+    "    # Collapse whitespace and strip punctuation/special characters\n",
+    "    text = re.sub(r'\\s+', ' ', text)\n",
+    "    text = re.sub(r'[^\\w\\s]', '', text)\n",
+    "    return text\n",
+    "\n",
+    "# Preprocess each article's text\n",
+    "for article in data:\n",
+    "    article['cleaned_text'] = preprocess_text(article['articleBody'])\n",
+    "\n",
+    "# Filter relevant articles; keywords are cleaned like the article text so punctuation (e.g. the hyphen in Al-Shifa) cannot prevent a match\n",
+    "def is_relevant_article(article):\n",
+    "    specific_keywords = [\"Al-Shifa Hospital\", \"Gaza hospital\", \"Israel airstrike\", \"Hamas attack\", \"Gaza conflict\"]\n",
+    "    return any(preprocess_text(keyword).lower() in article['cleaned_text'].lower() for keyword in specific_keywords)\n",
+    "\n",
+    "relevant_articles = [article for article in data if is_relevant_article(article)]\n",
+    "\n",
+    "# Load a pre-trained extractive QA model\n",
+    "qa_pipeline = pipeline(\"question-answering\", model=\"distilbert-base-uncased-distilled-squad\")\n",
+    "\n",
+    "# Fetch additional context from Wikipedia, identifying the client with a user agent as the API requires\n",
+    "def get_wikipedia_context(topic):\n",
+    "    user_agent = 'MyUserAgent/1.0 (myemail@example.com)'  # Replace with your own details\n",
+    "    wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')\n",
+    "    page = wiki_wiki.page(topic)\n",
+    "    if page.exists():\n",
+    "        return page.summary\n",
+    "    else:\n",
+    "        return \"\"\n",
+    "\n",
+    "# Combine article text (capped at max_length characters) with the Wikipedia summary\n",
+    "def get_combined_context(relevant_articles, wiki_topic, max_length=5000):\n",
+    "    context = \"\"\n",
+    "    for article in relevant_articles:\n",
+    "        if len(context) + len(article['cleaned_text']) > max_length:\n",
+    "            break\n",
+    "        context += article['cleaned_text'] + \" \"\n",
+    "\n",
+    "    wiki_context = get_wikipedia_context(wiki_topic)\n",
+    "    combined_context = context + \" \" + wiki_context\n",
+    "\n",
+    "    # Debugging: print the first 1000 characters to check the content\n",
+    "    print(\"Combined Context: \", combined_context[:1000])\n",
+    "\n",
+    "    return combined_context\n",
+    "\n",
+    "# Answer a question using the combined news and Wikipedia context\n",
+    "def get_answer(question, relevant_articles, wiki_topic):\n",
+    "    combined_context = get_combined_context(relevant_articles, wiki_topic)\n",
+    "    result = qa_pipeline(question=question, context=combined_context)\n",
+    "    return result['answer']\n",
+    "\n",
+    "# Example question\n",
+    "question = \"What happened at the Al-Shifa Hospital?\"\n",
+    "wiki_topic = \"Al-Shifa Hospital Gaza conflict\"\n",
+    "answer = get_answer(question, relevant_articles, wiki_topic)\n",
+    "print(\"Answer: \", answer)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
"kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/explanation.txt b/explanation.txt new file mode 100644 index 0000000..2ee2b45 --- /dev/null +++ b/explanation.txt @@ -0,0 +1,19 @@ +I have done first part of the Q1 and since there is mentioned that I need to attempt any of 2 so I just did first one, and below is the explanation +of the same step by step: + +1. Data Loading and Preprocessing +First, we load the JSON file containing the news articles. Since the data might contain special characters and extra spaces, +we need to clean it up. +2. Filtering Relevant Articles +To ensure our QA system focuses on the Israel-Hamas war, we filter out irrelevant articles. We define a list of keywords related +to the conflict and keep only those articles that contain these keywords. +3. Load a Pre-trained QA Model +We use a pre-trained QA model from Hugging Face's Transformers library. In this case, we choose the distilbert-base-uncased-distilled-squad +model for its efficiency and accuracy in question answering tasks. +4. Augment Context with Wikipedia +To enhance the context available for the QA system, we fetch additional information from Wikipedia. This step helps provide a more +comprehensive background for the QA model. +5. Combine Context +We combine the filtered articles with the Wikipedia context. This combined context is then used by the QA model to answer questions. +6. Answer Questions +Finally, we create a function that uses the combined context to answer a given question. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9b08d67 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +json +re +transformers +wikipediaapi \ No newline at end of file