Patch summary (preamble text; patch tools skip everything before the first
"diff --git" header).

1.news-extract-data.py
  * imports By, WebDriverWait and expected_conditions
  * waits (WebDriverWait(driver, 10)) for the teaser containers to be present
    instead of calling find_elements right after driver.get()
  * reads the headline from ./a/h3 instead of ./a/h2
  * falls back to "N/A" when the title, subtitle or link lookup raises
  * writes headline.csv with index=False
  * sets the ChromeDriver path to 'chromedriver.exe'
2.news-headless.py
  * sets the ChromeDriver path to 'chromedriver.exe'

NOTE(review): the added handlers are bare "except:" clauses, which also catch
KeyboardInterrupt and SystemExit; consider narrowing them to
selenium.common.exceptions.NoSuchElementException in a follow-up change.
NOTE(review): leading indentation inside hunk lines was reconstructed assuming
4-space Python indents, and inline spacing is kept as received; verify the
context lines against the target files before applying.

diff --git a/2.Automate The News/1.news-extract-data.py b/2.Automate The News/1.news-extract-data.py
index 530a5af..3b75a41 100644
--- a/2.Automate The News/1.news-extract-data.py
+++ b/2.Automate The News/1.news-extract-data.py
@@ -1,25 +1,40 @@
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 import pandas as pd
 
 web = 'https://www.thesun.co.uk/sport/football/'
-path = '/Users/frankandrade/Downloads/chromedriver' # introduce path here
+path = 'chromedriver.exe' # Update the path to ChromeDriver
 
 # Creating the driver
 driver_service = Service(executable_path=path)
 driver = webdriver.Chrome(service=driver_service)
 driver.get(web)
 
-# Finding Elements
-containers = driver.find_elements(by='xpath', value='//div[@class="teaser__copy-container"]')
+# Wait for elements to load
+wait = WebDriverWait(driver, 10)
+containers = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//div[@class="teaser__copy-container"]')))
 
 titles = []
 subtitles = []
 links = []
+
 for container in containers:
-    title = container.find_element(by='xpath', value='./a/h2').text
-    subtitle = container.find_element(by='xpath', value='./a/p').text
-    link = container.find_element(by='xpath', value='./a').get_attribute('href')
+    try:
+        title = container.find_element(By.XPATH, './a/h3').text
+    except:
+        title = "N/A"
+    try:
+        subtitle = container.find_element(By.XPATH, './a/p').text
+    except:
+        subtitle = "N/A"
+    try:
+        link = container.find_element(By.XPATH, './a').get_attribute('href')
+    except:
+        link = "N/A"
+
     titles.append(title)
     subtitles.append(subtitle)
     links.append(link)
@@ -27,6 +42,6 @@
 # Exporting data to a CSV file
 my_dict = {'title': titles, 'subtitle': subtitles, 'link': links}
 df_headlines = pd.DataFrame(my_dict)
-df_headlines.to_csv('headline.csv')
+df_headlines.to_csv('headline.csv', index=False)
 
 driver.quit()
diff --git a/2.Automate The News/2.news-headless.py b/2.Automate The News/2.news-headless.py
index 8023e4b..cb336d2 100644
--- a/2.Automate The News/2.news-headless.py
+++ b/2.Automate The News/2.news-headless.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 web = 'https://www.thesun.co.uk/sport/football/'
-path = '/Users/frankandrade/Downloads/chromedriver' # introduce path here
+path = 'chromedriver.exe' # introduce path here
 
 # add headless mode
 options = Options()