In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 25)
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
movie = pd.read_csv('movies.csv')
credit = pd.read_csv('credits.csv')
movie.head()
credit.head()
# 合并两个数据集
credit.columns = ['id','cast', 'title', 'crew']
movie= movie.merge(credit, on='id')
movie.head()
movie.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4803 entries, 0 to 4802 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 budget 4803 non-null int64 1 genres 4803 non-null object 2 homepage 1712 non-null object 3 id 4803 non-null int64 4 keywords 4803 non-null object 5 original_language 4803 non-null object 6 original_title 4803 non-null object 7 overview 4800 non-null object 8 popularity 4803 non-null float64 9 production_companies 4803 non-null object 10 production_countries 4803 non-null object 11 release_date 4802 non-null object 12 revenue 4803 non-null int64 13 runtime 4801 non-null float64 14 spoken_languages 4803 non-null object 15 status 4803 non-null object 16 tagline 3959 non-null object 17 title_x 4803 non-null object 18 vote_average 4803 non-null float64 19 vote_count 4803 non-null int64 20 cast 4803 non-null object 21 title_y 4803 non-null object 22 crew 4803 non-null object dtypes: float64(3), int64(4), object(16) memory usage: 863.2+ KB
In [3]:
# 词云图
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
# 自定义一个画词云图的函数
def cloud(col):
wcloud = " ".join(f for f in movie[col])
wc_ = WordCloud(width = 2000, height = 1000, random_state=1, background_color='black', colormap='Set2', collocations=False, stopwords = STOPWORDS)
wc_.generate(wcloud)
plt.subplots(figsize=(10,6))
plt.imshow(wc_, interpolation="bilinear")
plt.axis("off")
In [4]:
# 画出标题列的词云图
cloud("original_title")
In [5]:
# 填充overview变量中的缺失值
movie["overview"] = movie["overview"].fillna("")
cloud("overview")
In [6]:
# Tfidf向量化
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movie["overview"])
tfidf_matrix
Out[6]:
<4803x20978 sparse matrix of type '<class 'numpy.float64'>' with 125840 stored elements in Compressed Sparse Row format>
In [7]:
# 我们将使用sklearn的linear_kernel()而不是cosine_similarity(),因为它更快。
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# 索引和电影original_title的反向映射
indices = pd.Series(movie.index, index=movie['original_title']).drop_duplicates()
In [9]:
# 自定义一个推荐函数
def get_recommendation(title, cosine_sim):
idx = indices[title]
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:11]
movies = [i[0] for i in sim_scores]
movies = movie["original_title"].iloc[movies]
return movies
# 如果你搜索“Spectre”,下面的电影名称将被推荐
get_recommendation('Spectre', cosine_sim)
Out[9]:
1343 Never Say Never Again 4071 From Russia with Love 3162 Thunderball 1717 Safe Haven 11 Quantum of Solace 4339 Dr. No 29 Skyfall 1880 Dance Flick 3336 Diamonds Are Forever 1743 Octopussy Name: original_title, dtype: object
In [10]:
# 如果你搜索“John Carter”,下面的电影名称将被推荐
get_recommendation("John Carter", cosine_sim)
Out[10]:
1254 Get Carter 4161 The Marine 4: Moving Target 2932 Raising Cain 3349 Desperado 1307 The Hurricane 3068 Rescue Dawn 345 Rush Hour 2 581 Star Trek: Insurrection 2998 Devil 4274 Eddie: The Sleepwalking Cannibal Name: original_title, dtype: object
In [11]:
# 将字符串化后的特征解析为对应的python对象
from ast import literal_eval
features = ['keywords', 'genres']
for feature in features:
movie[feature] = movie[feature].apply(literal_eval)
movie[['original_title', 'keywords', 'genres']].head(3)
Out[11]:
| original_title | keywords | genres | |
|---|---|---|---|
| 0 | Avatar | [{'id': 1463, 'name': 'culture clash'}, {'id':... | [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam... |
| 1 | Pirates of the Caribbean: At World's End | [{'id': 270, 'name': 'ocean'}, {'id': 726, 'na... | [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... |
| 2 | Spectre | [{'id': 470, 'name': 'spy'}, {'id': 818, 'name... | [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam... |
In [12]:
# 提取类型列表
def list_genres(x):
l = [d['name'] for d in x]
return(l)
movie['genres'] = movie['genres'].apply(list_genres)
In [13]:
# 提取关键字列表
def list_keyword(y):
i = [a['name'] for a in y]
return(i)
movie['keywords'] = movie['keywords'].apply(list_keyword)
In [14]:
# 结合类型和关键词
def genre(x):
return ''.join(' '.join(x['genres']) + ' ' + ' '.join(x['keywords']))
movie['mix'] = movie.apply(genre, axis=1)
movie["mix"]
Out[14]:
0 Action Adventure Fantasy Science Fiction cultu...
1 Adventure Fantasy Action ocean drug abuse exot...
2 Action Adventure Crime spy based on novel secr...
3 Action Crime Drama Thriller dc comics crime fi...
4 Action Adventure Science Fiction based on nove...
...
4798 Action Crime Thriller united states–mexico bar...
4799 Comedy Romance
4800 Comedy Drama Romance TV Movie date love at fir...
4801
4802 Documentary obsession camcorder crush dream girl
Name: mix, Length: 4803, dtype: object
In [15]:
# 向量化
from sklearn.feature_extraction.text import CountVectorizer
countvect = CountVectorizer(stop_words="english")
countvect_mat = tfidf.fit_transform(movie["mix"])
countvect_mat
Out[15]:
<4803x7069 sparse matrix of type '<class 'numpy.float64'>' with 60983 stored elements in Compressed Sparse Row format>
In [16]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(countvect_mat, countvect_mat)
# 索引和电影original_title的反向映射
movie = movie.reset_index()
indices = pd.Series(movie.index, index=movie['original_title'])
# 如果你搜索“John Carter”,下面的电影名称将被推荐
get_recommendation("John Carter", cos_sim)
Out[16]:
373 Mission to Mars 2964 The Last Days on Mars 754 Planet 51 141 Mars Needs Moms 487 Red Planet 1319 Riddick 752 My Favorite Martian 0 Avatar 278 Planet of the Apes 541 Soldier Name: original_title, dtype: object
In [17]:
# 如果你搜索“Soldier”,下面的电影名称将被推荐
get_recommendation("Soldier", cos_sim)
Out[17]:
1319 Riddick 0 Avatar 278 Planet of the Apes 754 Planet 51 2047 Megiddo: The Omega Code 2 193 After Earth 1642 Splice 838 Alien³ 3158 Alien 4332 Silent Running Name: original_title, dtype: object
In [19]:
# 基于投票的过滤:人口统计学过滤
# avarage评级
avg = movie["vote_average"].mean()
# 我们将使用第90个百分位数作为截止点。换句话说,一部电影要想进入榜单,它必须比榜单上至少90%的电影获得更多的选票。
q = movie["vote_count"].quantile(0.9)
print(avg)
print(q)
6.092171559442016 1838.4000000000015
In [20]:
# 符合条件的电影
movies = movie[movie["vote_count"] >= q]
# weighted_rating函数
def weighted_rating(x, q=q, avg=avg):
v = x['vote_count']
R = x['vote_average']
# 根据IMDB公式计算
return (v/(v+q) * R) + (q/(q+v) * avg)
In [21]:
# 符合条件的影片
movies["score"] = movies.apply(weighted_rating, axis=1)
# 根据上面计算的分数对电影进行排序
movies = movies.sort_values('score', ascending=False)
# 打印前10部电影
listed = movies[['original_title', 'vote_count', 'vote_average', 'score', "popularity"]].head(10)
In [24]:
# 可视化
import seaborn as sns
plt.subplots(figsize=(10,6))
sns.barplot(x="score", y="original_title", data=listed, palette="Set2")
# sns.barplot(listed["score"], listed["original_title"], palette="Set2")
plt.title("Movie Vs Score")
plt.show()
In [25]:
# 热门电影
popular= movies.sort_values('popularity', ascending=False)
plt.figure(figsize=(12,4))
plt.barh(popular['original_title'].head(10),popular['popularity'].head(10), align='center',
color="#313131")
plt.gca().invert_yaxis()
plt.xlabel("Popularity")
plt.title("Popular Movies")
plt.show()
In [ ]: