-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
101 lines (89 loc) · 4.65 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import matplotlib.pyplot as plt
import pandas as pd
import requests
import streamlit as st
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from wordcloud import WordCloud
import main_functions
# Known bugs:
# - The NYTimes API returns an error-message JSON if too many requests are sent in a short span of time. This
#   causes the frequency distribution and wordcloud sections to generate an error on the page.
# TODO: Extract a helper for wordcloud generation; the top stories and most popular articles sections currently
# contain redundant code.
# Load the NYTimes API key from the local JSON settings file.
api_key_dict = main_functions.read_from_file("JSON_Files/api_key.json")
api_key = api_key_dict["api_key"]

st.title("Article Word Frequency Visualizer")
st.header("Word Map Generator")

st.subheader("Part A - The Stories API")
st.write(
    "This app uses the Top Stories API to display the most common words used in the top current articles "
    "based on a specified topic selected by the user. The data is displayed as a line chart and as a wordcloud "
    "image."
)

st.subheader("1 - Topic Selection")
user_name = st.text_input("Please enter your name", "")
# Topic whose articles feed the frequency distribution and the wordcloud below.
option = st.selectbox(
    "Select a topic of your interest",
    ("Arts", "Automobiles", "Books", "Business", "Fashion", "Food", "Health", "Home", "Insider", "Magazine",
     "Movies", "NYRegion", "Obituaries", "Opinion", "Politics", "RealEstate", "Science", "Sports", "SundayReview",
     "Technology", "Theater", "T-Magazine", "Travel", "Upshot", "US", "World"),
    index=0,
)

# Ask the NYTimes Top Stories API for the articles of the selected topic.
stories_url = f"https://api.nytimes.com/svc/topstories/v2/{option.lower()}.json?api-key={api_key}"
try:
    # requests applies no timeout by default, so a stalled connection would otherwise block the app.
    stories_response = requests.get(stories_url, timeout=10).json()
except (requests.RequestException, ValueError):
    # The exception text is deliberately not displayed: it can contain the request URL,
    # which carries the API key.
    st.error("Could not retrieve the top stories from the NYTimes API. Please try again in a moment.")
    st.stop()  # nothing below can work without a response, so end this script run here

# Whatever JSON came back (including an API error message) is stored for the sections below to read.
main_functions.save_to_file(stories_response, "JSON_Files/response.json")
st.write(f"Hi {user_name}, you have selected the {option} topic.")
st.subheader("2 - Frequency Distribution")
# Combined abstract text of the selected topic; filled in once the distribution is requested.
story_abstracts = ""
# Words to omit from counting, such as "the", "and", etc. Stored under its own name so the imported
# nltk `stopwords` reader is not overwritten, and as a set so each membership test is O(1).
stop_words = set(stopwords.words("english"))
clean_words = []
fDistribution = st.checkbox("Click here to generate frequency distribution")
if fDistribution:
    articles = main_functions.read_from_file("JSON_Files/response.json")
    # An API error message (e.g. when requests are rate limited) has no "results" entry.
    results = articles.get("results") if isinstance(articles, dict) else None
    if not results:
        st.error(
            "The NYTimes API did not return any articles for this topic "
            "(it may be rate limiting requests). Please try again in a moment."
        )
    else:
        # Join with a space: gluing abstracts directly together fuses the last word of one
        # with the first word of the next, and the fused token is then dropped by isalpha().
        story_abstracts = " ".join(article.get("abstract") or "" for article in results)
        # Keep alphabetic tokens only, lower-cased, skipping the omitted words.
        clean_words = [
            token.lower()
            for token in word_tokenize(story_abstracts)
            if token.isalpha() and token.lower() not in stop_words
        ]
        if not clean_words:
            st.warning("The abstracts for this topic contain no words to count.")
        else:
            fDist = FreqDist(clean_words)
            top_words = fDist.most_common(10)  # list of (word, count) pairs
            # The words become the index (named "times", as before) and the counts the plotted column.
            chart_data = pd.DataFrame(
                {
                    "times": [word for word, _ in top_words],
                    "Occurrences": [count for _, count in top_words],
                }
            ).set_index("times")
            st.line_chart(chart_data)
# Generate WordCloud based on user's selected article topic category
st.subheader("3 - Wordcloud")
wordCloud = st.checkbox("Click here to generate wordcloud")
if wordCloud:
    articles = main_functions.read_from_file("JSON_Files/response.json")
    # An API error message (e.g. when requests are rate limited) has no "results" entry.
    results = articles.get("results") if isinstance(articles, dict) else None
    # Build the text from scratch rather than appending to the text gathered by the frequency
    # section, which duplicated every abstract when both checkboxes were ticked. A space
    # separator keeps the words at abstract boundaries from fusing together.
    cloud_text = " ".join(article.get("abstract") or "" for article in results or [])
    try:
        topic_cloud = WordCloud().generate(cloud_text)
    except ValueError:
        # WordCloud refuses to draw when the text contains no usable words.
        st.error(
            "The NYTimes API did not return any article text for this topic "
            "(it may be rate limiting requests). Please try again in a moment."
        )
    else:
        # Draw on an explicit figure and hand that to Streamlit instead of the global pyplot state.
        fig, ax = plt.subplots(figsize=(12, 12))
        ax.imshow(topic_cloud)
        ax.axis("off")
        st.pyplot(fig)
        plt.close(fig)  # release the figure; the script is re-run on every interaction
        st.write(f"Wordcloud generated for {option} topic.")
# Generate wordcloud based on the most shared, emailed, or viewed articles in the last X days.
st.subheader("Part B - Most Popular Articles")
st.write("Select if you want to see the most shared, emailed, or viewed articles.")
articleType = st.selectbox("Select your preferred set of articles", ("Shared", "Emailed", "Viewed"))
timePeriod = st.selectbox("Select the period of time (last days)", ("1", "7", "30"))

popular_url = (
    f"https://api.nytimes.com/svc/mostpopular/v2/{articleType.lower()}/{timePeriod}.json"
    f"?api-key={api_key}"
)
try:
    # requests applies no timeout by default, so a stalled connection would otherwise block the app.
    popular_response = requests.get(popular_url, timeout=10).json()
except (requests.RequestException, ValueError):
    # The exception text is deliberately not displayed: it can contain the request URL,
    # which carries the API key.
    st.error("Could not retrieve the most popular articles from the NYTimes API. Please try again in a moment.")
    st.stop()  # nothing below can work without a response, so end this script run here

# Persist the fresh response. Previously the response was fetched but never stored, and the
# wordcloud was built from whatever JSON_Files/response2.json already held (or failed if absent).
main_functions.save_to_file(popular_response, "JSON_Files/response2.json")

# An API error message (e.g. when requests are rate limited) has no "results" entry.
results = popular_response.get("results") if isinstance(popular_response, dict) else None
# A space separator keeps the words at abstract boundaries from fusing together.
pop_abstracts = " ".join(article.get("abstract") or "" for article in results or [])
try:
    pop_wordcloud = WordCloud().generate(pop_abstracts)
except ValueError:
    # WordCloud refuses to draw when the text contains no usable words.
    st.error(
        "The NYTimes API did not return any article text for this selection "
        "(it may be rate limiting requests). Please try again in a moment."
    )
else:
    # Draw on an explicit figure and hand that to Streamlit instead of the global pyplot state.
    fig, ax = plt.subplots(figsize=(12, 12))
    ax.imshow(pop_wordcloud)
    ax.axis("off")
    st.pyplot(fig)
    plt.close(fig)  # release the figure; the script is re-run on every interaction