# -*- coding: utf-8 -*-
"""INFO202.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1cngza8xW9_Ae1_9hGp3vbfEgHBKF0CN3
"""
!pip install rank_bm25

import json
import os
import re
import string
import time

import requests
from bs4 import BeautifulSoup
from datetime import datetime
from dateutil import parser

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

from rank_bm25 import BM25Okapi

from google.colab import drive
from IPython.core import display as ICD

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Alternative seed list (one URL per line for readability):
# url_list = [
#     "https://www.moneycontrol.com/news/tags/companies.html",
#     "https://www.moneycontrol.com/news/tags/moneycontrol-analysis.html",
#     "https://www.moneycontrol.com/news/business/moneycontrol-research/",
#     "https://www.moneycontrol.com/news/business/mutual-funds/",
#     "https://www.moneycontrol.com/news/business/personal-finance/",
#     "https://www.moneycontrol.com/news/tags/residential.html",
#     "https://www.moneycontrol.com/news/tags/commercial.html",
#     "https://www.moneycontrol.com/news/tags/urbanreforms.html",
#     "https://www.moneycontrol.com/news/tags/real-estate.html",
#     "https://www.moneycontrol.com/news/tags/auto.html",
#     "https://www.moneycontrol.com/news/politics/",
#     "https://www.moneycontrol.com/news/india/",
#     "https://www.moneycontrol.com/news/tags/bankingtech.html",
#     "https://www.moneycontrol.com/news/tags/insurancetech.html",
#     "https://www.moneycontrol.com/news/technology/",
# ]
"""# Web Crawling to get news related links/urls"""
url_list = []

def get_linked_urls(url, html):
    """Collect up to 10 article links from the MoneyControl news landing page."""
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        path = link.get('href')
        # Keep only links that point at MoneyControl news article pages.
        if "https://www.moneycontrol.com/news/" in str(path) and "html" in str(path):
            url_list.append(path)
        if len(url_list) == 10:
            break
    # if path and path.startswith('/'):
    #     path = url.join(url, path)
    # print(path)
url = "https://www.moneycontrol.com/news/"
htm = requests.get(url).text
get_linked_urls(url, htm)
print(url_list)
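# Note (added for illustration): the landing page can repeat the same <a> href,
# so url_list may hold duplicates. An optional order-preserving de-duplication,
# not part of the original pipeline, would be:
# url_list = list(dict.fromkeys(url_list))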
"""# Web Scraping to get news articles from fetched links/urls
max pages to scrape for each link is set to 3
"""
news_articles = {}
main_news_articles = {}
news_count = 0

for main_url in url_list:
    p = 1
    max_pages = 3
    while p <= max_pages:  # scrape at most `max_pages` pages per link
        new_url = main_url + "/page-{}/".format(p)
        response = requests.get(new_url)
        print(new_url)
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')
        url_tag = soup.find('a', {'class': 'act'})        # pagination markers (currently unused)
        url_last = soup.find_all('a', {'class': 'last'})
        articles = soup.find_all('li', {'class': 'clearfix'})
        for article in articles:
            title = article.find('h2').text
            link = article.find('a').get('href')
            date = article.find('span').text
            a = parser.parse(date)
            new_a = a.replace(tzinfo=None)
            year = new_a.year
            news_response = requests.get(link, timeout=15)
            news_data = news_response.text
            news_soup = BeautifulSoup(news_data, 'html.parser')
            if news_soup.find('div', {'class': 'arti-flow'}):
                news_text = news_soup.find('div', {'class': 'arti-flow'})
                # Strip inline scripts and styles from the article body.
                for x in news_text.find_all("script"):
                    x.decompose()
                for y in news_text.find_all('style'):
                    y.decompose()
                try:
                    # Drop the trailing link (usually a "read more"/disclaimer link), if any.
                    news_text.find_all('a')[-1].decompose()
                    news = news_text.text
                except IndexError:
                    news = news_text.text
                news_articles[news_count] = {"headline": title, "text": news, "year": str(year)}
                main_news_articles.update(news_articles)
                news_count += 1
        p += 1
    print('\nLimit set for maximum pages\n')

print(len(main_news_articles))
# Commented out IPython magic to ensure Python compatibility.
mnc = pd.DataFrame.from_dict(main_news_articles, orient='index').drop_duplicates()
mnc = mnc.reset_index()
mnc

drive.mount('/content/drive')
# %cd '/content/drive/MyDrive/'
mnc.to_csv('mnc.csv')
# print(news_count)
"""# Transform and tokenize words"""
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

def word_transform(text):
    """Strip punctuation, tokenize, keep alphabetic non-stopword tokens, Porter-stem them."""
    if not text:
        print('The text to be tokenized is None. Defaulting to an empty string.')
        text = ''
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.tokenize.word_tokenize(text)
    words = [word for word in tokens if word.isalpha()]
    words = [w for w in words if w not in stop_words]
    stemmed = [porter.stem(word) for word in words]
    return stemmed
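# A quick demonstration (added for illustration): PorterStemmer lowercases and
# stems, so the line below should print stems like ['stock', 'ralli', 'gain',
# 'today'] -- the token "2" is dropped by the isalpha() filter.
print(word_transform("Stocks rallied, gaining 2% today"))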
# mnc_list = mnc.text.to_list()
# # print(mnc_list)
# list_tokenized = [word_transform(l) for l in mnc_list]
# print(list_tokenized)
"""# Calculate frequency of each word in all news articles
"""
def get_list_tokenised(data):
    mnc_list = data.text.to_list()
    list_tokenized = [word_transform(l) for l in mnc_list]
    return list_tokenized

def calc_doc_freq(data):
    """Map each token to the number of articles it appears in (document frequency)."""
    dfreq = {}
    list_tokenized = get_list_tokenised(data)
    for i in range(len(list_tokenized)):
        tokens = list_tokenized[i]
        for w in tokens:
            try:
                dfreq[w].add(i)
            except KeyError:
                dfreq[w] = {i}
    for i in dfreq:
        dfreq[i] = len(dfreq[i])
    return dfreq
dfreq = calc_doc_freq(mnc)
print(dfreq)
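# A minimal sketch (added for illustration) of what calc_doc_freq returns, on a
# toy three-article frame -- the column is named "text" because
# get_list_tokenised reads data.text. The output should be roughly
# {'market': 2, 'ralli': 1, 'fall': 2, 'rupee': 1}, up to exact Porter stems.
toy = pd.DataFrame({"text": ["markets rally", "markets fall", "rupee falls"]})
print(calc_doc_freq(toy))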
"""# New Calculate TFIDF for each word in each article"""
import numpy as np
from collections import Counter
def article_freq(word, dfreq):
    """Return the document frequency of `word`, or 0 if it was never seen."""
    return dfreq.get(word, 0)

def cal_tfidf(dfreq, mnc):
    len_mnc = len(mnc['text'])
    words_set = len(dfreq)
    names = [i for i in dfreq.keys()]
    print(len(names))
    tf_idf = pd.DataFrame(np.zeros((len_mnc, words_set)), columns=names)
    list_tokenized = get_list_tokenised(mnc)
    for i in range(len(mnc)):
        tokens = list_tokenized[i]
        counter = Counter(tokens)
        words_count = len(tokens)
        for token in np.unique(tokens):
            # term frequency
            tf = counter[token] / words_count
            # document frequency of the token
            df = article_freq(token, dfreq)
            # smoothed inverse document frequency
            idf = np.log((len(mnc) + 1) / (df + 1))
            tf_idf.at[i, token] = tf * idf
    return tf_idf
tfidf_score = cal_tfidf(dfreq, mnc)
tfidf_score
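# A minimal sanity check (not in the original notebook): recompute the TF-IDF of
# one token in the first article by hand and compare it with the table above.
# The formula mirrors cal_tfidf: tf = count/len(doc), idf = log((N+1)/(df+1)).
_tokens0 = get_list_tokenised(mnc)[0]
_token = _tokens0[0]
_tf = _tokens0.count(_token) / len(_tokens0)
_idf = np.log((len(mnc) + 1) / (article_freq(_token, dfreq) + 1))
print(_token, _tf * _idf, tfidf_score.at[0, _token])  # the two values should match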
"""# TF IDF vectorizer
# Cosine similarity of each article in the dataframe "mnc"
"""
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(mnc["text"])
start = time.time()
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("Time taken: %s seconds" % (time.time() - start))
h = [row for row in mnc.headline]
pretty_cos = pd.DataFrame(cosine_sim, columns = h, index = h)
pretty_cos.head()
"""# Linear Kernel
Since the tf-idf functionality in sklearn.feature_extraction.text produces normalized vectors, hence, in this case, cosine_similarity is equivalent to linear_kernel. (only slower in case of large datasets)
"""
start = time.time()
linear_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print("Time taken: %s seconds" % (time.time() - start))
pretty_cos = pd.DataFrame(linear_sim, columns = h, index = h)
pretty_cos.head()
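# Quick check (added for illustration): with L2-normalized TF-IDF vectors the
# two similarity matrices should agree up to floating-point noise.
print(np.allclose(cosine_sim, linear_sim))  # expected: True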
"""# *Get News Article Recommendation based on headline as input*"""
def get_news_recommendations(title, data, cosine_sim, indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 5 most similar news articles (index 0 is the query itself)
    sim_scores = sim_scores[1:6]
    print(sim_scores)
    news_indices = [i[0] for i in sim_scores]
    # Return the 5 most similar news articles
    return pd.DataFrame(data.iloc[news_indices])

rec_vector = TfidfVectorizer(stop_words='english')
# Despite the name, this vectorizes the full article text, not just the headlines.
headlines = mnc['text'].drop_duplicates()
# Construct the TF-IDF matrix
tfidf_matrix = rec_vector.fit_transform(headlines.values.astype('U'))
# Generate the cosine similarity matrix
cosine_sim_headlines = cosine_similarity(tfidf_matrix, tfidf_matrix)
indices = pd.Series(mnc.index, index=mnc['headline']).dropna()
start = time.time()
cosine_recc = get_news_recommendations("Edtech’s failure is Indian education sector's curse to bear", mnc, cosine_sim_headlines, indices)
print("Time taken: %s seconds" % (time.time() - start))
cosine_recc
start = time.time()
cosine_recc = get_news_recommendations("Indian rupee weakens past 81-mark for first time against US dollar", mnc, cosine_sim_headlines, indices)
print("Time taken: %s seconds" % (time.time() - start))
cosine_recc
start = time.time()
cosine_recc = get_news_recommendations("Cummins India shares test 4-month high as it provides tech for Gail's hydrogen plant", mnc, cosine_sim_headlines, indices)
print("Time taken: %s seconds" % (time.time() - start))
cosine_recc
"""# Calculating BM25 scores for each article and giving output based on the score and exact matches"""
lemmatizer = WordNetLemmatizer()

def smart_tokenize(txt):
    # Note: this version strips punctuation; it is redefined (without the
    # stripping) further below.
    text = txt.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text.lower())
    return [lemmatizer.lemmatize(w) for w in tokens]

def transform_text(text):
    # Identical to word_transform above; kept under the name the search code uses.
    if not text:
        print('The text to be tokenized is None. Defaulting to an empty string.')
        text = ''
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.tokenize.word_tokenize(text)
    words = [word for word in tokens if word.isalpha()]
    words = [w for w in words if w not in stop_words]
    stemmed = [porter.stem(word) for word in words]
    return stemmed
mnc['text_new'] = mnc.apply(lambda row: ' '.join(transform_text(row.text)), axis=1)
mnc.head()
"""# BM25 on tokenized data and fetching feature names of article's text"""
tokenized_corpus = [doc.split(" ") for doc in mnc["text_new"]]
bm25 = BM25Okapi(tokenized_corpus)
vectorizer = TfidfVectorizer(use_idf=True)
vectors = vectorizer.fit_transform(mnc["text_new"])
words_set = vectorizer.get_feature_names_out()
print(words_set)
"""## 1. Take input from user as a search query
## 2. Tokenize the input
## 3. Assign score to the query
## 4. Check for exact matches in article's text
## 5. Sort and display the result based on high BM25 scores
"""
def smart_tokenize(txt):
    tokens = nltk.word_tokenize(txt.lower())
    return [lemmatizer.lemmatize(w) for w in tokens]

def exact_match(query, arr):
    """Return 1 if every query token (or the whole query string) occurs in `arr`,
    else 0. Matching is case-insensitive substring matching."""
    if type(query) == list:
        count = 0
        for i in query:
            if i.lower() in arr.lower():
                count += 1
        return 1 if count == len(query) else 0
    elif query.lower() in arr.lower():
        return 1
    else:
        return 0
def bm25_model(bm_query):
    for query in bm_query:
        print('Query:', query)
        t1 = time.time()
        tokenized_query = transform_text(query)
        print("tokenized_query: {}".format(tokenized_query))
        doc_scores = bm25.get_scores(tokenized_query)
        result = mnc[['headline', 'text', 'text_new', 'year']].copy()
        result['bm25score'] = pd.Series(doc_scores)
        # Flag articles that contain every query token verbatim.
        result['exact'] = mnc.text_new.apply(lambda row: exact_match(tokenized_query, row))
        # Exact matches first, then highest BM25 score.
        result.sort_values(['exact', 'bm25score'], ascending=[False, False], inplace=True)
        final_result = result[['headline', 'text', 'text_new', 'year', 'bm25score', 'exact']]
        ICD.display(final_result[final_result['bm25score'] > 0])
        t2 = time.time() - t1
        print('elapsed:', t2, 's')

bm_query = [input('Enter search query:')]
bm25_model(bm_query)
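# For non-interactive runs (e.g. executing the exported script outside Colab),
# the query can be passed directly instead of read from input():
# bm25_model(["rupee weakens against dollar"])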
"""# Using get_top_n from BM25 module based on user input"""
query2 = input('Enter search query:')
tokenized_query = transform_text(query2)
print(tokenized_query)
docs = bm25.get_top_n(tokenized_query, mnc["text_new"], n=5)
df_search = mnc[mnc["text_new"].isin(docs)]
df_search.head()
"""# Testing the same model on another annotated dataset from Kaggle.
#Movie Dataset Recommendation
"""
# Commented out IPython magic to ensure Python compatibility.
drive.mount('/content/drive')
# %cd '/content/drive/MyDrive/'
"""#Dataset taken from https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/ [License CC0: Public Domain]
Saved in Google drive
"""
meta = pd.read_csv("movies_metadata.csv")
meta = meta[(meta['title'].notna()) & (meta['overview'].notna())]
meta = meta.drop_duplicates(subset="title", keep="first")
old_metadata = meta.reset_index()
metadata = old_metadata[1:10001]  # keep 10,000 rows (rows 1 through 10,000)
metadata
metadata.loc[metadata["title"] == "Pokémon: Spell of the Unknown"]
# metadata['title'] = metadata['title'].drop_duplicates()
indices = pd.Series(metadata.index, index=metadata['title']).dropna()
indices
movie_plots = metadata['overview']
print(movie_plots)
tfidf = TfidfVectorizer()
# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(movie_plots.values.astype('U'))
start = time.time()
# Generate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print("Time taken: %s seconds" % (time.time() - start))
print(cosine_sim)
start = time.time()
cosine_org_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_org_sim)
print("Time taken: %s seconds" % (time.time() - start))
def get_recommendations(title, cosine_sim, indices):
    idx = indices[title]
    print(idx)
    # print(cosine_sim[idx])
    sim_scores = list(enumerate(cosine_sim[idx]))
    print(sim_scores)
    # Sort based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar movies (index 0 is the query itself)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]
start = time.time()
print(get_recommendations("Strings", cosine_sim, indices))
print("Time taken: %s seconds" % (time.time() - start))
# The function above works unchanged with the cosine_similarity matrix, so there
# is no need to redefine it; just pass cosine_org_sim as the second argument.
start = time.time()
print(get_recommendations("15 Minutes", cosine_org_sim, indices))
print("Time taken: %s seconds" % (time.time() - start))