-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 2dea471
Showing
3 changed files
with
341 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,341 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "409de4be-92f8-40d6-b84a-0b6f5c4ef32e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from bs4 import BeautifulSoup\n", | ||
"import requests\n", | ||
"import pandas as pd\n", | ||
"import numpy as np" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "cc9b00ec-666b-44f6-a77d-4d3abcb5643a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Function to extract Product Title\n", | ||
"def get_title(soup):\n", | ||
"\n", | ||
" try:\n", | ||
" # Outer Tag Object\n", | ||
" title = soup.find(\"span\", attrs={\"id\":'productTitle'})\n", | ||
" \n", | ||
" # Inner NavigatableString Object\n", | ||
" title_value = title.text\n", | ||
"\n", | ||
" # Title as a string value\n", | ||
" title_string = title_value.strip()\n", | ||
"\n", | ||
" except AttributeError:\n", | ||
" title_string = \"\"\n", | ||
"\n", | ||
" return title_string\n", | ||
"\n", | ||
"# Function to extract Product Price\n", | ||
"def get_price(soup):\n", | ||
"\n", | ||
" try:\n", | ||
" price = soup.find(\"span\", attrs={\"class\":\"a-price-whole\"}).text.strip()\n", | ||
"\n", | ||
" except AttributeError:\n", | ||
" price = \"\"\n", | ||
"\n", | ||
" return price\n", | ||
"\n", | ||
"# Function to extract Product Rating\n", | ||
"def get_rating(soup):\n", | ||
"\n", | ||
" try:\n", | ||
" rating = soup.find(\"i\", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()\n", | ||
" \n", | ||
" except AttributeError:\n", | ||
" try:\n", | ||
" rating = soup.find(\"span\", attrs={'class':'a-icon-alt'}).string.strip()\n", | ||
" except:\n", | ||
" rating = \"\"\t\n", | ||
"\n", | ||
" return rating\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "5a7c7175-0069-434b-9b80-d2d03c26318f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"if __name__ == '__main__':\n", | ||
"\n", | ||
" # add your user agent \n", | ||
"\n", | ||
" \n", | ||
" HEADERS = ({'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/109.0','Accept-Language':'en-US,en;q=0.5','Connection':'keep-alive','Referer':'http://www.google.com/'})\n", | ||
"\n", | ||
" # The webpage URL\n", | ||
" URL =\"https://www.amazon.in/s?k=ipad&ref=nb_sb_noss\"\n", | ||
"\n", | ||
" # HTTP Request\n", | ||
" webpage = requests.get(URL, headers=HEADERS)\n", | ||
"\n", | ||
" # Soup Object containing all data\n", | ||
" soup = BeautifulSoup(webpage.content, \"html.parser\")\n", | ||
"\n", | ||
" # Fetch links as List of Tag Objects\n", | ||
" links = soup.find_all(\"a\", attrs={'class':'a-link-normal s-line-clamp-2 s-link-style a-text-normal'})\n", | ||
"\n", | ||
" # Store the links\n", | ||
" links_list = []\n", | ||
"\n", | ||
" # Loop for extracting links from Tag Objects\n", | ||
" for link in links:\n", | ||
" links_list.append(link.get('href'))\n", | ||
"\n", | ||
" d = {\"title\":[], \"price\":[], \"rating\":[]}\n", | ||
" \n", | ||
" # Loop for extracting product details from each link \n", | ||
" for link in links_list:\n", | ||
" new_webpage = requests.get(\"https://www.amazon.in\" + link, headers=HEADERS)\n", | ||
"\n", | ||
" new_soup = BeautifulSoup(new_webpage.content, \"html.parser\")\n", | ||
"\n", | ||
" # Function calls to display all necessary product information\n", | ||
" d['title'].append(get_title(new_soup))\n", | ||
" d['price'].append(get_price(new_soup))\n", | ||
" d['rating'].append(get_rating(new_soup))\n", | ||
"\n", | ||
"\n", | ||
" amazon_df = pd.DataFrame.from_dict(d)\n", | ||
" amazon_df['title'] = amazon_df['title'].replace('', np.nan).infer_objects(copy=False)\n", | ||
" amazon_df = amazon_df.dropna(subset=['title'])\n", | ||
" amazon_df.to_csv(\"amazon_data.csv\", header=True, index=False)\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "3a9898bc-a0ad-4dca-a4c4-792f3f452b77", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>title</th>\n", | ||
" <th>price</th>\n", | ||
" <th>rating</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>Apple iPad (10th Generation): with A14 Bionic ...</td>\n", | ||
" <td>33,899.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>Apple iPad Air 11″ (M2): Liquid Retina Display...</td>\n", | ||
" <td>56,900.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>Apple iPad (10th Generation): with A14 Bionic ...</td>\n", | ||
" <td>39,999.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>Apple iPad (10th Generation): with A14 Bionic ...</td>\n", | ||
" <td>33,899.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>Apple iPad Air 11″ (M2): Liquid Retina Display...</td>\n", | ||
" <td>56,900.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>5</th>\n", | ||
" <td>Apple iPad Air 11″ (M2): Liquid Retina Display...</td>\n", | ||
" <td>56,900.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>6</th>\n", | ||
" <td>Apple iPad Air 11″ (M2): Liquid Retina Display...</td>\n", | ||
" <td>59,467.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>7</th>\n", | ||
" <td>Apple iPad (9th Generation): with A13 Bionic c...</td>\n", | ||
" <td>30,400.</td>\n", | ||
" <td>4.6 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>8</th>\n", | ||
" <td>Apple iPad Pro 11″ (M4): Ultra Retina XDR Disp...</td>\n", | ||
" <td>99,900.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>9</th>\n", | ||
" <td>Apple iPad Pro 13″ (M4): Ultra Retina XDR Disp...</td>\n", | ||
" <td>1,29,900.</td>\n", | ||
" <td>4.3 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>10</th>\n", | ||
" <td>Apple iPad Mini (A17 Pro): Apple Intelligence,...</td>\n", | ||
" <td>49,900.</td>\n", | ||
" <td>4.6 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>11</th>\n", | ||
" <td>Apple iPad (9th Generation): with A13 Bionic c...</td>\n", | ||
" <td>30,400.</td>\n", | ||
" <td>4.6 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>12</th>\n", | ||
" <td>Apple iPad Pro 11″ (M4): Ultra Retina XDR Disp...</td>\n", | ||
" <td>1,09,900.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>13</th>\n", | ||
" <td>Apple iPad (10th Generation): with A14 Bionic ...</td>\n", | ||
" <td>49,900.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>14</th>\n", | ||
" <td>Apple iPad (10th Generation): with A14 Bionic ...</td>\n", | ||
" <td>49,900.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>15</th>\n", | ||
" <td>DIGIROOT iPad Pencil【NO.1 Sales in US&EU】,13 M...</td>\n", | ||
" <td>1,549.</td>\n", | ||
" <td>4.4 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>16</th>\n", | ||
" <td>Apple 2020 iPad with A12 Bionic chip (10.2-inc...</td>\n", | ||
" <td>29,900.</td>\n", | ||
" <td>4.7 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>17</th>\n", | ||
" <td>Apple iPad Air 11″ (M2): Liquid Retina Display...</td>\n", | ||
" <td>69,900.</td>\n", | ||
" <td>4.5 out of 5 stars</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" title price \\\n", | ||
"0 Apple iPad (10th Generation): with A14 Bionic ... 33,899. \n", | ||
"1 Apple iPad Air 11″ (M2): Liquid Retina Display... 56,900. \n", | ||
"2 Apple iPad (10th Generation): with A14 Bionic ... 39,999. \n", | ||
"3 Apple iPad (10th Generation): with A14 Bionic ... 33,899. \n", | ||
"4 Apple iPad Air 11″ (M2): Liquid Retina Display... 56,900. \n", | ||
"5 Apple iPad Air 11″ (M2): Liquid Retina Display... 56,900. \n", | ||
"6 Apple iPad Air 11″ (M2): Liquid Retina Display... 59,467. \n", | ||
"7 Apple iPad (9th Generation): with A13 Bionic c... 30,400. \n", | ||
"8 Apple iPad Pro 11″ (M4): Ultra Retina XDR Disp... 99,900. \n", | ||
"9 Apple iPad Pro 13″ (M4): Ultra Retina XDR Disp... 1,29,900. \n", | ||
"10 Apple iPad Mini (A17 Pro): Apple Intelligence,... 49,900. \n", | ||
"11 Apple iPad (9th Generation): with A13 Bionic c... 30,400. \n", | ||
"12 Apple iPad Pro 11″ (M4): Ultra Retina XDR Disp... 1,09,900. \n", | ||
"13 Apple iPad (10th Generation): with A14 Bionic ... 49,900. \n", | ||
"14 Apple iPad (10th Generation): with A14 Bionic ... 49,900. \n", | ||
"15 DIGIROOT iPad Pencil【NO.1 Sales in US&EU】,13 M... 1,549. \n", | ||
"16 Apple 2020 iPad with A12 Bionic chip (10.2-inc... 29,900. \n", | ||
"17 Apple iPad Air 11″ (M2): Liquid Retina Display... 69,900. \n", | ||
"\n", | ||
" rating \n", | ||
"0 4.5 out of 5 stars \n", | ||
"1 4.5 out of 5 stars \n", | ||
"2 4.5 out of 5 stars \n", | ||
"3 4.5 out of 5 stars \n", | ||
"4 4.5 out of 5 stars \n", | ||
"5 4.5 out of 5 stars \n", | ||
"6 4.5 out of 5 stars \n", | ||
"7 4.6 out of 5 stars \n", | ||
"8 4.5 out of 5 stars \n", | ||
"9 4.3 out of 5 stars \n", | ||
"10 4.6 out of 5 stars \n", | ||
"11 4.6 out of 5 stars \n", | ||
"12 4.5 out of 5 stars \n", | ||
"13 4.5 out of 5 stars \n", | ||
"14 4.5 out of 5 stars \n", | ||
"15 4.4 out of 5 stars \n", | ||
"16 4.7 out of 5 stars \n", | ||
"17 4.5 out of 5 stars " | ||
] | ||
}, | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"amazon_df" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.