Skip to content

Commit

Permalink
Add Python File
Browse files Browse the repository at this point in the history
  • Loading branch information
deept-agl committed Jan 30, 2025
0 parents commit 2dea471
Show file tree
Hide file tree
Showing 3 changed files with 341 additions and 0 deletions.
341 changes: 341 additions & 0 deletions Amazon_web_scraping_ipad_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,341 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "409de4be-92f8-40d6-b84a-0b6f5c4ef32e",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import requests\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cc9b00ec-666b-44f6-a77d-4d3abcb5643a",
"metadata": {},
"outputs": [],
"source": [
"# Function to extract Product Title\n",
"def get_title(soup):\n",
"\n",
" try:\n",
" # Outer Tag Object\n",
" title = soup.find(\"span\", attrs={\"id\":'productTitle'})\n",
" \n",
" # Inner NavigatableString Object\n",
" title_value = title.text\n",
"\n",
" # Title as a string value\n",
" title_string = title_value.strip()\n",
"\n",
" except AttributeError:\n",
" title_string = \"\"\n",
"\n",
" return title_string\n",
"\n",
"# Function to extract Product Price\n",
"def get_price(soup):\n",
"\n",
" try:\n",
" price = soup.find(\"span\", attrs={\"class\":\"a-price-whole\"}).text.strip()\n",
"\n",
" except AttributeError:\n",
" price = \"\"\n",
"\n",
" return price\n",
"\n",
"# Function to extract Product Rating\n",
"def get_rating(soup):\n",
"\n",
" try:\n",
" rating = soup.find(\"i\", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()\n",
" \n",
" except AttributeError:\n",
" try:\n",
" rating = soup.find(\"span\", attrs={'class':'a-icon-alt'}).string.strip()\n",
" except:\n",
" rating = \"\"\t\n",
"\n",
" return rating\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5a7c7175-0069-434b-9b80-d2d03c26318f",
"metadata": {},
"outputs": [],
"source": [
"\n",
"if __name__ == '__main__':\n",
"\n",
" # add your user agent \n",
"\n",
" \n",
" HEADERS = ({'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/109.0','Accept-Language':'en-US,en;q=0.5','Connection':'keep-alive','Referer':'http://www.google.com/'})\n",
"\n",
" # The webpage URL\n",
" URL =\"https://www.amazon.in/s?k=ipad&ref=nb_sb_noss\"\n",
"\n",
" # HTTP Request\n",
" webpage = requests.get(URL, headers=HEADERS)\n",
"\n",
" # Soup Object containing all data\n",
" soup = BeautifulSoup(webpage.content, \"html.parser\")\n",
"\n",
" # Fetch links as List of Tag Objects\n",
" links = soup.find_all(\"a\", attrs={'class':'a-link-normal s-line-clamp-2 s-link-style a-text-normal'})\n",
"\n",
" # Store the links\n",
" links_list = []\n",
"\n",
" # Loop for extracting links from Tag Objects\n",
" for link in links:\n",
" links_list.append(link.get('href'))\n",
"\n",
" d = {\"title\":[], \"price\":[], \"rating\":[]}\n",
" \n",
" # Loop for extracting product details from each link \n",
" for link in links_list:\n",
" new_webpage = requests.get(\"https://www.amazon.in\" + link, headers=HEADERS)\n",
"\n",
" new_soup = BeautifulSoup(new_webpage.content, \"html.parser\")\n",
"\n",
" # Function calls to display all necessary product information\n",
" d['title'].append(get_title(new_soup))\n",
" d['price'].append(get_price(new_soup))\n",
" d['rating'].append(get_rating(new_soup))\n",
"\n",
"\n",
" amazon_df = pd.DataFrame.from_dict(d)\n",
" amazon_df['title'] = amazon_df['title'].replace('', np.nan).infer_objects(copy=False)\n",
" amazon_df = amazon_df.dropna(subset=['title'])\n",
" amazon_df.to_csv(\"amazon_data.csv\", header=True, index=False)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3a9898bc-a0ad-4dca-a4c4-792f3f452b77",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>price</th>\n",
" <th>rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Apple iPad (10th Generation): with A14 Bionic ...</td>\n",
" <td>33,899.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Apple iPad Air 11″ (M2): Liquid Retina Display...</td>\n",
" <td>56,900.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Apple iPad (10th Generation): with A14 Bionic ...</td>\n",
" <td>39,999.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Apple iPad (10th Generation): with A14 Bionic ...</td>\n",
" <td>33,899.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Apple iPad Air 11″ (M2): Liquid Retina Display...</td>\n",
" <td>56,900.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Apple iPad Air 11″ (M2): Liquid Retina Display...</td>\n",
" <td>56,900.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Apple iPad Air 11″ (M2): Liquid Retina Display...</td>\n",
" <td>59,467.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Apple iPad (9th Generation): with A13 Bionic c...</td>\n",
" <td>30,400.</td>\n",
" <td>4.6 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Apple iPad Pro 11″ (M4): Ultra Retina XDR Disp...</td>\n",
" <td>99,900.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Apple iPad Pro 13″ (M4): Ultra Retina XDR Disp...</td>\n",
" <td>1,29,900.</td>\n",
" <td>4.3 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Apple iPad Mini (A17 Pro): Apple Intelligence,...</td>\n",
" <td>49,900.</td>\n",
" <td>4.6 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Apple iPad (9th Generation): with A13 Bionic c...</td>\n",
" <td>30,400.</td>\n",
" <td>4.6 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Apple iPad Pro 11″ (M4): Ultra Retina XDR Disp...</td>\n",
" <td>1,09,900.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Apple iPad (10th Generation): with A14 Bionic ...</td>\n",
" <td>49,900.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Apple iPad (10th Generation): with A14 Bionic ...</td>\n",
" <td>49,900.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>DIGIROOT iPad Pencil【NO.1 Sales in US&amp;EU】,13 M...</td>\n",
" <td>1,549.</td>\n",
" <td>4.4 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Apple 2020 iPad with A12 Bionic chip (10.2-inc...</td>\n",
" <td>29,900.</td>\n",
" <td>4.7 out of 5 stars</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Apple iPad Air 11″ (M2): Liquid Retina Display...</td>\n",
" <td>69,900.</td>\n",
" <td>4.5 out of 5 stars</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title price \\\n",
"0 Apple iPad (10th Generation): with A14 Bionic ... 33,899. \n",
"1 Apple iPad Air 11″ (M2): Liquid Retina Display... 56,900. \n",
"2 Apple iPad (10th Generation): with A14 Bionic ... 39,999. \n",
"3 Apple iPad (10th Generation): with A14 Bionic ... 33,899. \n",
"4 Apple iPad Air 11″ (M2): Liquid Retina Display... 56,900. \n",
"5 Apple iPad Air 11″ (M2): Liquid Retina Display... 56,900. \n",
"6 Apple iPad Air 11″ (M2): Liquid Retina Display... 59,467. \n",
"7 Apple iPad (9th Generation): with A13 Bionic c... 30,400. \n",
"8 Apple iPad Pro 11″ (M4): Ultra Retina XDR Disp... 99,900. \n",
"9 Apple iPad Pro 13″ (M4): Ultra Retina XDR Disp... 1,29,900. \n",
"10 Apple iPad Mini (A17 Pro): Apple Intelligence,... 49,900. \n",
"11 Apple iPad (9th Generation): with A13 Bionic c... 30,400. \n",
"12 Apple iPad Pro 11″ (M4): Ultra Retina XDR Disp... 1,09,900. \n",
"13 Apple iPad (10th Generation): with A14 Bionic ... 49,900. \n",
"14 Apple iPad (10th Generation): with A14 Bionic ... 49,900. \n",
"15 DIGIROOT iPad Pencil【NO.1 Sales in US&EU】,13 M... 1,549. \n",
"16 Apple 2020 iPad with A12 Bionic chip (10.2-inc... 29,900. \n",
"17 Apple iPad Air 11″ (M2): Liquid Retina Display... 69,900. \n",
"\n",
" rating \n",
"0 4.5 out of 5 stars \n",
"1 4.5 out of 5 stars \n",
"2 4.5 out of 5 stars \n",
"3 4.5 out of 5 stars \n",
"4 4.5 out of 5 stars \n",
"5 4.5 out of 5 stars \n",
"6 4.5 out of 5 stars \n",
"7 4.6 out of 5 stars \n",
"8 4.5 out of 5 stars \n",
"9 4.3 out of 5 stars \n",
"10 4.6 out of 5 stars \n",
"11 4.6 out of 5 stars \n",
"12 4.5 out of 5 stars \n",
"13 4.5 out of 5 stars \n",
"14 4.5 out of 5 stars \n",
"15 4.4 out of 5 stars \n",
"16 4.7 out of 5 stars \n",
"17 4.5 out of 5 stars "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"amazon_df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file added Snippets/Amazon_Website_Ipad_link.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Snippets/Final_amazon_IPAD_datarame.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 2dea471

Please sign in to comment.