Oscar Prediction 2020 Part I -
Web Scraping & Data Preparation
I scraped the IMDB website to get the nominees list for 10 different awards for the last 25 years.
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import numpy as np
import re
import requests as rq
from datetime import datetime
# Limit notebook DataFrame display to 5 rows / 28 columns.
pd.options.display.max_rows = 5
pd.options.display.max_columns = 28
# Selenium-driven Chrome instance, used because IMDb/Rotten Tomatoes pages
# are JavaScript-rendered and need a real browser to produce full HTML.
# NOTE(review): `executable_path` was removed in Selenium 4 — confirm the
# installed Selenium version, or pass a Service object when upgrading.
browser = webdriver.Chrome(executable_path="D:/chromedriver.exe")
First, I collected the list of award shows to scrape: one row per show, with its name and its IMDb event-history URL.
# One row per award show: its name and the IMDb event-history page URL.
award_show_list = pd.read_csv("E:/Downloads/award_show_list.csv")
# Notebook cell output: inspect the loaded table.
award_show_list
# Scrape the nominees for every award show in `award_show_list`.
# For each show: read the per-year ceremony URLs from its event-history
# widget, then walk the 25 most recent ceremonies that have category data,
# recording every category's nominees, details, IMDb links and winner flag.
awards_all = pd.DataFrame()
for award_show in range(len(award_show_list)):
    award_show_name = award_show_list['award_show'][award_show]
    award_show_url = award_show_list['url'][award_show]
    browser.get(award_show_url)
    soup = BeautifulSoup(browser.page_source, 'lxml')
    find = soup.find('div', {'class': 'event-history-widget'}).find_all('a')
    award_years = [int(ele.text.strip()) for ele in find]
    award_urls = ['https://www.imdb.com/' + ele.get('href') for ele in find]
    award_url_df = pd.DataFrame({"award_years": award_years,
                                 "final_urls": award_urls})
    ref_year = 1   # counts ceremonies actually scraped (target: 25)
    row_num = 1    # 1-based cursor into award_url_df
    while ref_year <= 25 and row_num <= len(award_url_df):
        url = award_url_df['final_urls'][row_num - 1]
        year = award_url_df['award_years'][row_num - 1]
        browser.get(url)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        # Ceremony pages with no category widgets yield <= 1 here and are
        # skipped (row_num still advances, ref_year does not).
        number_of_awards = (len(soup.find_all('div', {'class': 'event-widgets__award-category'}))
                            - len(soup.find_all('div', {'class': 'event-widgets__award-name'})) + 1)
        if number_of_awards > 1:
            # Hoist the repeated full-document lookups out of the loops;
            # the original re-ran soup.findAll(...) for every nominee.
            category_names = soup.find_all('div', {'class': 'event-widgets__award-category-name'})
            categories = soup.find_all('div', {'class': 'event-widgets__award-category'})
            for award in range(number_of_awards):
                award_name = category_names[award].text.strip()
                nominee_divs = categories[award].find_all('div', {'class': 'event-widgets__primary-nominees'})
                nominees = [ele.text.strip() for ele in nominee_divs]
                imdb_urls = []
                for div in nominee_divs:
                    link = div.find('a')
                    # Some nominees carry no IMDb link at all.
                    imdb_urls.append('https://www.imdb.com/' + link.get('href')
                                     if link is not None else None)
                details = [ele.text.strip()
                           for ele in categories[award].find_all('div', {'class': 'event-widgets__secondary-nominees'})]
                temp_df = pd.DataFrame({"nominees": nominees,
                                        "details": details,
                                        "imdb_urls": imdb_urls})
                temp_df['award_show'] = award_show_name
                temp_df['year'] = year
                temp_df['award'] = award_name
                # IMDb lists the winner first within each category.
                temp_df['winner'] = [1 if index == 0 else 0 for index in temp_df.index]
                temp_df['ref_year'] = ref_year
                awards_all = pd.concat([awards_all, temp_df])
            ref_year += 1
        row_num += 1
awards_all.head()
I scraped the IMDB website to get some movie features (e.g. box office revenue, duration, budget) for all the Oscar nominees.
Here I selected all the Oscar Best Picture nominees.
# Keep only the Academy Awards "Best Picture" rows.  Two category labels
# are matched because IMDb renamed the category across years.
# (The original also had a no-op `oscar_picture = oscar_picture`, removed.)
oscar = awards_all[awards_all['award_show'] == 'Academy Awards']
oscar_picture = oscar[oscar['award'].isin(['Best Motion Picture of the Year',
                                           'Best Picture'])]
# Notebook cell output: inspect the filtered nominees.
oscar_picture
# Fresh 0..n-1 index so later .loc[n, ...] row writes line up.
oscar_picture = oscar_picture.reset_index(drop=True)
# For each Best Picture nominee, open its IMDb title page and scrape:
# Metascore, IMDb rating, popularity rank, genres, budget, worldwide
# gross and runtime.  Fields that cannot be parsed are stored as None.
metabase1 = []
rating1 = []
popularity1 = []
genres1 = []
budget1 = []
gross1 = []
minute1 = []
# Compile every pattern once, outside the loop (originally re-compiled
# per movie).
metascore_pat = re.compile(r'(\d+,?)+')
popularity_pat = re.compile(r'(\d+)\n')
budget_pat = re.compile(r'(\d+)+\n')
gross_pat = re.compile(r'Cumulative Worldwide Gross\:.*\$(\d+)')
minute_pat = re.compile(r'(\d+).*min')
for row_num in range(len(oscar_picture)):
    imdb_url = oscar_picture['imdb_urls'][row_num]
    browser.get(imdb_url)
    soup = BeautifulSoup(browser.page_source, 'lxml')
    # Metascore: first number in the review-bar widget (commas stripped).
    metabase = soup.find('div', {'class': 'titleReviewBarItem'}).text.strip()
    metabase = metascore_pat.findall(re.sub(',', "", metabase))
    metabase1.append(metabase[0])
    # IMDb rating is rendered like "8.1/10" -> drop the "/10" suffix.
    rating = soup.find('div', {'class': 'ratingValue'}).text.strip()
    rating1.append(re.sub(r'/.*$', "", rating))
    try:
        popularity = soup.findAll('span', {'class': 'subText'})[2].text.strip()
        if '\n' in popularity:
            popularity = popularity_pat.findall(popularity)[0]
        popularity1.append(popularity)
    except Exception:
        popularity1.append(None)
    # Genres live in the second "see-more inline canwrap" storyline box.
    genre_links = soup.findAll('div', {'class': 'see-more inline canwrap'})[1].find_all('a')
    genres1.append([a.text.strip() for a in genre_links])
    details_div = soup.find('div', {'id': 'titleDetails'})
    # Budget: 7th txt-block of the details section — position-based, so
    # the numeric extraction is guarded for differently laid-out pages.
    budget = details_div.findAll('div', {'class': 'txt-block'})[6].text.strip()
    try:
        budget1.append(budget_pat.findall(re.sub(',', "", budget))[0])
    except Exception:
        budget1.append(None)
    details_text = details_div.text.strip()
    gross1.append(gross_pat.findall(re.sub(',', "", details_text))[0])
    try:
        minute1.append(minute_pat.findall(details_text)[0])
    except Exception:
        minute1.append(None)
oscar_picture = oscar_picture.assign(metabase=metabase1, rating=rating1,
                                     popularity=popularity1, genres=genres1,
                                     budget=budget1, gross=gross1,
                                     minute=minute1)
oscar_picture.head()
For each Oscar nominee, I created several new features indicating how it fared at other award ceremonies. If it won an award, the value of that award's feature is set to 1; if it was nominated but did not win, the value is 0; and if it was not nominated at all, the value is -1.
# Nominee rows from every non-Oscar award show, restricted to each
# ceremony's "best picture"-equivalent categories.
other = awards_all[awards_all['award_show'] != 'Academy Awards']
picture_categories = [
    "Best Edited Feature Film",
    "Best Edited Feature Film - Dramatic",
    "Best Edited Feature Film - Comedy or Musical",
    "Best Edited Feature Film - Comedy",
    "Best Film",
    "Best Picture",
    "Best Motion Picture, Comedy or Musical",
    "Best Motion Picture, Drama",
    "Best Motion Picture - Drama",
    "Best Motion Picture - Musical or Comedy",
    "Best Motion Picture - Comedy or Musical",
    "Best Motion Picture",
]
is_picture = other['award'].isin(picture_categories)
other_picture = other[is_picture][['nominees', 'award_show', 'year', 'winner']]
# Attach each Oscar nominee's result at the other ceremonies, then pivot
# to one column per award show (winner flag as the value; -1 = that movie
# was never nominated there).
all_picture = pd.merge(oscar_picture, other_picture,
                       on=['nominees', 'year'], how='left')
all_picture1 = all_picture.pivot(index='nominees',
                                 columns='award_show_y',
                                 values='winner_y').fillna(-1)
all_picture2 = pd.merge(oscar_picture, all_picture1,
                        on=['nominees'], how='left')
oscar_picture.head()
Then, for each Oscar-nominated movie, I want to scrape its critic reviews from the Rotten Tomatoes website in order to perform some text mining and predictions.
I considered the Oscar Dates here because I only want the reviews before the award date. I believe the reviews released after the movie won the Oscar award would be biased.
# Pull the ceremony-dates table from Wikipedia so reviews can later be
# restricted to those published before each year's Oscar ceremony.
soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/List_of_Academy_Awards_ceremonies').text, 'lxml')
# The dates live in the second wikitable on the page.
table = soup.find('table', {'class': 'wikitable'}).find('tbody')
table = table.findNext('table', {'class': 'wikitable'}).find('tbody')
data = []
for row in table.find_all('tr'):
    cell_texts = [td.text.strip() for td in row.find_all('td')]
    data.append([text for text in cell_texts if text])  # drop empty cells
# Second column holds the ceremony date; data[0] is the header row.
dates = [cells[1] for cells in data[1:] if len(cells) >= 2]
dates1 = [datetime.strptime(d, '%B %d, %Y').year for d in dates]
data = pd.DataFrame([dates, dates1]).T
data.rename(columns={0: 'date', 1: 'year'}, inplace=True)
# Join each nominee row with its ceremony date (merge key: 'year').
oscar_picture1 = pd.merge(oscar_picture, data, how='left')
Here I used the OMDb API to obtain the Rotten Tomatoes website URL for each movie based on their IMDb website URL.
# Extract each movie's IMDb title id ("tt" + 7 digits) from its URL, then
# query the OMDb API for the matching Rotten Tomatoes URL.
tt_pattern = re.compile(r'(tt\d{7})')
oscar_picture1['tt'] = 'test'  # placeholder column, overwritten row by row
for n in range(len(oscar_picture1)):
    # BUG FIX: findall returns a list — store the matched string itself,
    # and create/read the column on the same frame (the original wrote
    # the placeholder to `oscar_picture` and the list to `oscar_picture1`,
    # which would break the string concatenation in the API call below).
    oscar_picture1.loc[n, 'tt'] = tt_pattern.findall(oscar_picture1.loc[n, 'imdb_urls'])[0]
OMDB_API_KEY = '14631071'
tomatoURL = []
for n in range(len(oscar_picture1)):
    resp = rq.get('http://www.omdbapi.com/?i=' + oscar_picture1.loc[n, 'tt']
                  + '&apikey=' + OMDB_API_KEY + '&tomatoes=true')
    tomatoURL.append(resp.json()['tomatoURL'])
oscar_picture1['tomatoURL'] = pd.Series(tomatoURL)
oscar_picture1['tomatoURL'].head()
Then we can successfully get the critic reviews we want.
def getPageSocre(review_url):
    """Harvest one Rotten Tomatoes review page from the global ``soup``.

    Appends each review's raw score string to the global ``score_list``,
    its date to ``date_list`` and its blurb to ``con_list``.  Reviews
    without a parseable "x/y" score are skipped.  Relies on the
    module-level ``soup`` and ``pattern6``; the ``review_url`` parameter
    is kept for call-site compatibility but is not read.
    """
    reviews = soup.find('div', {'class': 'review_table'})
    cur = reviews.findNext('div', {'class': 'review-link'})
    date = reviews.findNext('div', {'class': 'review-date'})
    content = reviews.findNext('div', {'class': 'the_review'})
    # The original guarded on `findall(...) is not None`, but findall
    # always returns a list, so that check was always true — dropped.
    while cur is not None and date is not None:
        try:
            # Extract all three fields BEFORE appending so a failure on
            # any one of them cannot leave the lists out of sync (the
            # original appended one by one and could misalign them).
            score = pattern6.findall(cur.text.strip())[0]
            review_date = date.text.strip()
            blurb = content.text.strip()
        except Exception:
            pass  # no "x/y" score or missing blurb: skip this review
        else:
            score_list.append(score)
            date_list.append(review_date)
            con_list.append(blurb)
        cur = cur.findNext('div', {'class': 'review-link'})
        date = date.findNext('div', {'class': 'review-date'})
        content = content.findNext('div', {'class': 'the_review'})
import time

# Walk every nominee's Rotten Tomatoes review pages, harvest scored
# reviews via getPageSocre(), and keep only those published before that
# year's Oscar ceremony.
pattern6 = re.compile(r'(\d+\.?\d*?\/\d+)')
base_url = 'https://www.rottentomatoes.com/'
data = pd.DataFrame()
for n in range(len(oscar_picture1)):
    url = oscar_picture1['tomatoURL'][n]
    name = oscar_picture1['nominees'][n]
    review_url = url + '/reviews'
    browser.get(review_url)
    time.sleep(4)  # let the JS-rendered page settle before parsing
    soup = BeautifulSoup(browser.page_source, 'lxml')
    page_section = soup.find('span', {'class': 'pageInfo'})
    next_page = page_section.findNext('a', {'class': 'btn btn-xs btn-primary-rt'})['href']
    score_list = []
    con_list = []
    date_list = []
    while next_page is not None:
        browser.get(review_url)
        time.sleep(4)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        getPageSocre(review_url)  # appends into the three lists above
        page_section = soup.find('span', {'class': 'pageInfo'})
        # Middle pages show 4 pager buttons; fewer means this is the last
        # page (unless it is also the first page), so stop paging.
        if len(soup.findAll('a', {'class': 'btn btn-xs btn-primary-rt'})) != 4 and review_url != url + '/reviews':
            next_page = None
        else:
            next_page = page_section.findNext('a', {'class': 'btn btn-xs btn-primary-rt'})['href']
            review_url = base_url + next_page
    # Keep only reviews dated before this movie's ceremony.  The cutoff
    # date is parsed once, not once per review as before.
    ceremony_date = datetime.strptime(oscar_picture1['date'][n], '%B %d, %Y')
    score_list1 = []
    con_list1 = []
    for raw_score, raw_date, blurb in zip(score_list, date_list, con_list):
        if datetime.strptime(raw_date, '%B %d, %Y') < ceremony_date:
            num, denom = raw_score.split('/')
            score_list1.append(float(num) / float(denom))
            con_list1.append(blurb)
    data_temp = pd.DataFrame({"score": score_list1,
                              "review": con_list1,
                              "nominees": name})
    data = pd.concat([data, data_temp])
# Inner-join the per-review rows onto the nominee feature table (default
# merge keys: the shared columns, here 'nominees'), then persist it.
oscar_picture2 = oscar_picture1.merge(data)
oscar_picture2.head()
oscar_picture2.to_csv('6.csv')