-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
68 lines (50 loc) · 1.97 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import pandas as pd
import os
# Request headers: send a browser-like User-Agent string with the request.
headers = {'User-agent': 'Mozilla/5.0'}
# News Source
URL = 'https://www.bbc.com/news'
# Request the webpage. A timeout is set because requests waits indefinitely
# by default, which would hang the script on an unresponsive server.
request = requests.get(URL, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx responses instead of silently parsing an error page
# (which would produce an empty result further down).
request.raise_for_status()
html = request.content
# Parse the raw response bytes with the built-in HTML parser
soup = BeautifulSoup(html, 'html.parser')
def bbc_news_scraper(keyword=None):
    """Collect headline/date pairs from the parsed BBC News page.

    Reads the module-level ``soup`` object, walks every ``div`` with class
    ``gs-c-promo`` and keeps the promos that contain both a headline
    (``h3`` with class ``gs-c-promo-heading__title``) and a timestamp
    (``time`` with class ``qa-status-date``).  Headlines containing the
    lowercase substring ``'bbc'`` are skipped, and each headline text is
    kept only once (first occurrence wins).

    Args:
        keyword: Optional search term.  When given, every row whose headline
            contains it (case-insensitive) gets the term written to a
            ``Keyword`` column.  That column is only created if at least one
            headline matches.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Headline`` and ``Posted``
        (plus ``Keyword`` as described above).
    """
    news_list = []
    # Titles collected so far.  The result list stores (title, date) tuples,
    # so testing a bare title against that list never matches; a dedicated
    # set makes the duplicate check actually work.
    seen_titles = set()

    # Finds all the promo blocks in BBC Home
    for news in soup.find_all('div', class_='gs-c-promo'):
        headline = news.find('h3', class_='gs-c-promo-heading__title')
        date = news.find('time', class_='qa-status-date')
        if not (headline and date):
            continue

        news_title = headline.get_text().strip()
        news_date = date.get_text().strip()
        if news_title in seen_titles or 'bbc' in news_title:
            continue

        seen_titles.add(news_title)
        news_list.append((news_title, news_date))

    # Store news headlines in a DataFrame
    news_df = pd.DataFrame(news_list, columns=['Headline', 'Posted'])

    # Tag the rows whose headline mentions the keyword; list positions line
    # up with the DataFrame's default integer index.
    if keyword:
        needle = keyword.lower()
        for i, (title, _posted) in enumerate(news_list):
            if needle in title.lower():
                news_df.loc[i, 'Keyword'] = keyword

    return news_df
# Display source and current date & time
print("Source:", URL)
print("Date & Time:", datetime.now().strftime("%b %d, %Y | %I:%M %p\n"))
# Call the function without specifying a keyword
news_dataframe = bbc_news_scraper()
# Display headlines in console output
print(news_dataframe)
# Save the DataFrame to a CSV file at a fixed, hard-coded location
file_path = r'C:\Visual Studio Code (Workspace)\Web\bbc_news_headlines.csv'
try:
    # to_csv writes in 'w' mode by default and therefore replaces any
    # existing file; deleting it first is unnecessary and would lose the
    # previous data if the write then failed.
    news_dataframe.to_csv(file_path, index=False)
except OSError as err:
    # Raised e.g. when the target directory does not exist or is not writable
    print("\nScraped data was not saved.")
    print(f"Reason: {err}")
else:
    print(f"\nThe scraped data is saved to '{file_path}'.")