全国院线总票房破50亿！影院复工后，哪些电影最受欢迎？文章代码

发表于2020-08-31 3886次查看

影院复工数据分析

# 导入包
import numpy as np 
import pandas as pd 

from pyecharts.charts import Bar, Map, Line, Page
from pyecharts import options as opts 
from pyecharts.globals import SymbolType, WarningType
WarningType.ShowWarning = False

影院复工地图-淘票票

复工率Top5省份

# Load the top-5 reopening-rate table and order it by openCinemaRate, lowest first.
df_1 = pd.read_csv('../data/淘票票复工率Top5省份0827.csv').sort_values(by='openCinemaRate')
df_1

	boxOffice	boxOfficeStr	cityId	cityName	openCinemaCnt	openCinemaCntStr	openCinemaIncrCnt	openCinemaIncrCntStr	openCinemaRate	openCinemaRateStr	scheduleCnt	scheduleCntStr	showDate	soldSeatCnt	soldSeatCntStr
4	59522139	59.5万	999929	青海省	41	41	0	0	0.953	95.3%	1000	1000	20200827	16906	1.69万
3	970996899	971.0万	999916	河南省	510	510	0	0	0.964	96.4%	15119	1.51万	20200827	289174	28.92万
2	540238952	540.2万	999914	江西省	334	334	-5	-5	0.965	96.5%	7198	7198	20200827	158912	15.89万
1	370686517	370.7万	999920	广西	258	258	1	1	0.985	98.5%	5581	5581	20200827	104000	10.40万
0	27058030	27.1万	999926	西藏	23	23	1	1	1.000	100.0%	409	409	20200827	6764	6764

# Chart data: province names as categories, and the percentage strings
# (e.g. '95.3%') converted to floats by keeping the text before the '%' sign.
x_data = df_1['cityName'].tolist()
y_data = [float(rate.partition('%')[0]) for rate in df_1['openCinemaRateStr']]

# Horizontal bar chart (axes swapped via reversal_axis), written as one fluent chain.
bar1 = (
    Bar(init_opts=opts.InitOpts(width='1350px', height='750px'))
    .add_xaxis(x_data)
    .add_yaxis('', y_data, category_gap='50%')
    .set_global_opts(title_opts=opts.TitleOpts(title='全国复工率Top5省份（8-27）'))
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='right'))
    .set_colors('#F6A061')
    .reversal_axis()
)
bar1.render()

'C:\\Users\\wzd\\Desktop\\CDA\\CDA_Python\\Python项目实作\\网络爬虫\\电影\\电影复工分析\\code\\render.html'

全国影院复工地图

df_2 = pd.read_csv('../data/淘票票各省票房-0827.csv')

# Remove every '市' and '省' character from the region names in a single pass.
df_2['cityName'] = df_2['cityName'].str.replace(r'[市省]', '', regex=True)
df_2.head()

	boxOffice	boxOfficeStr	cityId	cityName	openCinemaCnt	openCinemaCntStr	openCinemaIncrCnt	openCinemaIncrCntStr	openCinemaRate	openCinemaRateStr	scheduleCnt	scheduleCntStr	showDate	soldSeatCnt	soldSeatCntStr
0	571133208	571.1万	99991	北京	206	206	3	3	0.866	86.6%	5729	5729	20200827	115052	11.51万
1	220772644	220.8万	99992	天津	90	90	0	0	0.833	83.3%	2636	2636	20200827	57467	5.75万
2	636961140	637.0万	99993	河北	384	384	-3	-3	0.901	90.1%	9807	9807	20200827	180247	18.02万
3	364804000	364.8万	99994	山西	264	264	-4	-4	0.917	91.7%	6163	6163	20200827	103178	10.32万
4	241007410	241.0万	99995	内蒙古	186	186	-7	-7	0.842	84.2%	4207	4207	20200827	68254	6.83万

# Keep only the region name and its count of open cinemas.
open_num = df_2.loc[:, ['cityName', 'openCinemaCnt']]
open_num.head()

	cityName	openCinemaCnt
0	北京	206
1	天津	90
2	河北	384
3	山西	264
4	内蒙古	186

# Region names and their open-cinema counts, as native Python lists.
x_data = open_num['cityName'].tolist()
y_data = open_num['openCinemaCnt'].tolist()

# Map of China shaded by the number of open cinemas per region.
map1 = Map(init_opts=opts.InitOpts(width='1350px', height='750px'))
map1.add("", [list(pair) for pair in zip(x_data, y_data)], maptype='china')
map1.set_global_opts(
    title_opts=opts.TitleOpts(title='全国影院复工数地图（8-27）'),
    # The upper bound is taken from the already-converted list so that a
    # built-in int is passed; Series.max() would hand pyecharts a NumPy
    # scalar, and pyecharts expects native Python types in chart options.
    visualmap_opts=opts.VisualMapOpts(max_=max(y_data, default=0),
                                      is_piecewise=True),
)
map1.render()

'C:\\Users\\wzd\\Desktop\\CDA\\CDA_Python\\Python项目实作\\网络爬虫\\电影\\电影复工分析\\code\\render.html'

全国影院复工数Top10

# Ten regions with the most open cinemas, largest first.
num_top10 = open_num.sort_values(by='openCinemaCnt', ascending=False).iloc[:10]
num_top10

	cityName	openCinemaCnt
18	广东	1092
9	江苏	813
10	浙江	663
22	四川	564
14	山东	517
15	河南	510
17	湖南	405
2	河北	384
16	湖北	374
11	安徽	363

# Region names and open-cinema counts for the top-10 table.
x_data = num_top10['cityName'].tolist()
y_data = num_top10['openCinemaCnt'].tolist()

# Vertical bar chart; the hidden visual map colours bars by value.
bar2 = Bar(init_opts=opts.InitOpts(width='1350px', height='750px'))
bar2.add_xaxis(x_data)
bar2.add_yaxis('', y_data)
bar2.set_global_opts(
    title_opts=opts.TitleOpts(title='全国影院复工数Top10省份（8-27日）'),
    # Scale the colour range to the data instead of a hard-coded 1092, so the
    # chart stays correct when the input file is refreshed.
    visualmap_opts=opts.VisualMapOpts(is_show=False, max_=max(y_data, default=0)),
)
bar2.render()

'C:\\Users\\wzd\\Desktop\\CDA\\CDA_Python\\Python项目实作\\网络爬虫\\电影\\电影复工分析\\code\\render.html'

	boxOffice	boxOfficeStr	cityId	cityName	openCinemaCnt	openCinemaCntStr	openCinemaIncrCnt	openCinemaIncrCntStr	openCinemaRate	openCinemaRateStr	scheduleCnt	scheduleCntStr	showDate	soldSeatCnt	soldSeatCntStr
0	422155895	422.2万	-5	全国	1397	1397	1009	1009	0.129	12.9%	12599	1.26万	20200720	164716	16.47万
1	537683369	537.7万	-5	全国	2040	2040	643	643	0.189	18.9%	23482	2.35万	20200721	215780	21.58万
2	644860406	644.9万	-5	全国	2567	2567	527	527	0.237	23.7%	35414	3.54万	20200722	274121	27.41万
3	662853406	662.9万	-5	全国	3152	3152	585	585	0.291	29.1%	46315	4.63万	20200723	309331	30.93万
4	2030300376	2030.3万	-5	全国	4670	4670	1518	1518	0.432	43.2%	83543	8.35万	20200724	757934	75.79万

# Print column dtypes and non-null counts for df_3.
# NOTE(review): df_3 is not assigned anywhere in this excerpt; presumably it is
# loaded by a notebook cell that is missing from the page — confirm.
df_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   boxOffice             39 non-null     int64  
 1   boxOfficeStr          39 non-null     object 
 2   cityId                39 non-null     int64  
 3   cityName              39 non-null     object 
 4   openCinemaCnt         39 non-null     int64  
 5   openCinemaCntStr      39 non-null     int64  
 6   openCinemaIncrCnt     39 non-null     int64  
 7   openCinemaIncrCntStr  39 non-null     int64  
 8   openCinemaRate        39 non-null     float64
 9   openCinemaRateStr     39 non-null     object 
 10  scheduleCnt           39 non-null     int64  
 11  scheduleCntStr        39 non-null     object 
 12  showDate              39 non-null     int64  
 13  soldSeatCnt           39 non-null     int64  
 14  soldSeatCntStr        39 non-null     object 
dtypes: float64(1), int64(9), object(5)
memory usage: 4.7+ KB

复工后票房数据.zip 预览下载

只看楼主倒序查看

6回复

2楼 CDA网校 • 2020-09-01

# Convert display strings such as '422.2万' or '1.2亿' into a numeric column
# 'true_box' expressed in units of 万 (10,000).
#
# The number pattern accepts both decimals and plain integers ('500万'); a
# pattern that requires a decimal point would yield NaN for the latter.
box_num = (
    df_3['boxOfficeStr']
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype('float')
)
# Unit suffix -> multiplier. A value with no suffix gets multiplier 1
# (assumes a bare number is already in base units — TODO confirm against the
# data source). An unrecognised suffix maps to NaN instead of leaving a string
# that would break the multiplication below.
box_unit = (
    df_3['boxOfficeStr']
    .str.extract(r'([\u4e00-\u9fa5]+)', expand=False)
    .fillna('')
    .map({'': 1, '万': 10000, '亿': 100000000})
)
df_3['true_box'] = box_num * box_unit / 10000
df_3.head()

	boxOffice	boxOfficeStr	cityId	cityName	openCinemaCnt	openCinemaCntStr	openCinemaIncrCnt	openCinemaIncrCntStr	openCinemaRate	openCinemaRateStr	scheduleCnt	scheduleCntStr	showDate	soldSeatCnt	soldSeatCntStr	true_box
0	422155895	422.2万	-5	全国	1397	1397	1009	1009	0.129	12.9%	12599	1.26万	20200720	164716	16.47万	422.2
1	537683369	537.7万	-5	全国	2040	2040	643	643	0.189	18.9%	23482	2.35万	20200721	215780	21.58万	537.7
2	644860406	644.9万	-5	全国	2567	2567	527	527	0.237	23.7%	35414	3.54万	20200722	274121	27.41万	644.9
3	662853406	662.9万	-5	全国	3152	3152	585	585	0.291	29.1%	46315	4.63万	20200723	309331	30.93万	662.9
4	2030300376	2030.3万	-5	全国	4670	4670	1518	1518	0.432	43.2%	83543	8.35万	20200724	757934	75.79万	2030.3

3楼 CDA网校 • 2020-09-01

# Axis labels 'MM-DD' built from the last four characters of showDate.
# Both halves are taken from open_trend so labels always line up with the
# y values; mixing in another frame's showDate relies on index alignment and
# produces NaN labels if the two frames differ.
# Assumes open_trend['showDate'] holds YYYYMMDD values — TODO confirm.
date_str = open_trend['showDate'].astype('str')
x_data = (date_str.str[-4:-2] + '-' + date_str.str[-2:]).tolist()
y_data = open_trend['openCinemaCnt'].tolist()

# Line chart of open-cinema counts with the minimum and maximum marked.
line2 = Line(init_opts=opts.InitOpts(width='1350px', height='750px'))
line2.add_xaxis(x_data)
line2.add_yaxis(
    '',
    y_data,
    markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="min"),
                                            opts.MarkPointItem(type_="max")]),
)
line2.set_global_opts(
    title_opts=opts.TitleOpts(title='全国影院复工趋势图'),
    # rotate is a numeric angle, so pass a number rather than the string '30'.
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=30)),
    yaxis_opts=opts.AxisOpts(name='', type_='value',
                             axislabel_opts=opts.LabelOpts(formatter="{value}")),
)
line2.set_series_opts(
    label_opts=opts.LabelOpts(is_show=False),
    linestyle_opts=opts.LineStyleOpts(width=3),
)
line2.render()

'C:\\Users\\wzd\\Desktop\\CDA\\CDA_Python\\Python项目实作\\网络爬虫\\电影\\电影复工分析\\code\\render.html'

4楼 CDA网校 • 2020-09-01

猫眼电影票房分析

复工以来不同影片累计票房排名前十

# Load the Maoyan per-movie daily box-office table and preview the first rows.
df_4 = pd.read_csv('../data/猫眼20200720-0827.csv')
df_4.head()

	movieName	releaseInfo	sumBoxDesc	boxSplitUnit	boxRate	showCount	showCountRate	avgShowView	avgSeatView	select_date
0	第一次的离别	上映首日	154.1万	152.12万	36.0%	3392	26.9%	17	13.8%	2020-07-20
1	误杀	NaN	12.07亿	60.58万	14.3%	1240	9.8%	19	20.5%	2020-07-20
2	哪吒之魔童降世	NaN	50.13亿	57.40万	13.5%	716	5.6%	17	10.9%	2020-07-20
3	寻梦环游记	NaN	12.12亿	42.55万	10.0%	896	7.1%	21	13.4%	2020-07-20
4	白蛇：缘起	NaN	4.55亿	23.21万	5.4%	340	2.6%	20	17.8%	2020-07-20

5楼 CDA网校 • 2020-09-01

 # 计算单日票房
# Convert boxSplitUnit strings such as '152.12万' into a numeric column
# 'unit_box' (number multiplied by its unit).
#
# The number pattern accepts both decimals and plain integers; a pattern that
# requires a decimal point would yield NaN for values like '60万'.
box_num = (
    df_4['boxSplitUnit']
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype('float')
)
# Unit suffix -> multiplier. A value with no suffix gets multiplier 1
# (assumes a bare number is already in base units — TODO confirm against the
# data source). An unrecognised suffix maps to NaN instead of leaving a string
# that would break the multiplication below.
box_unit = (
    df_4['boxSplitUnit']
    .str.extract(r'([\u4e00-\u9fa5]+)', expand=False)
    .fillna('')
    .map({'': 1, '万': 10000, '亿': 100000000})
)
df_4['unit_box'] = box_num * box_unit
df_4.head()

	movieName	releaseInfo	sumBoxDesc	boxSplitUnit	boxRate	showCount	showCountRate	avgShowView	avgSeatView	select_date	unit_box
0	第一次的离别	上映首日	154.1万	152.12万	36.0%	3392	26.9%	17	13.8%	2020-07-20	1521200.0
1	误杀	NaN	12.07亿	60.58万	14.3%	1240	9.8%	19	20.5%	2020-07-20	605800.0
2	哪吒之魔童降世	NaN	50.13亿	57.40万	13.5%	716	5.6%	17	10.9%	2020-07-20	574000.0
3	寻梦环游记	NaN	12.12亿	42.55万	10.0%	896	7.1%	21	13.4%	2020-07-20	425500.0
4	白蛇：缘起	NaN	4.55亿	23.21万	5.4%	340	2.6%	20	17.8%	2020-07-20	232100.0

6楼 CDA网校 • 2020-09-01

# Running (cumulative) box office per movie, built from the daily figures.
# Movies are emitted in order of first appearance. Within a movie the running
# total follows df_4's row order, so df_4 is assumed to be sorted by
# select_date — TODO confirm.
# The per-movie pieces are collected and concatenated once: DataFrame.append
# no longer exists in pandas 2.x, and appending inside a loop re-copies the
# accumulated frame on every iteration.
daily_cols = ['movieName', 'select_date', 'unit_box']
df_sumbox = pd.concat(
    [
        one_movie.assign(cum_box=one_movie['unit_box'].cumsum())
        for _, one_movie in df_4[daily_cols].groupby('movieName', sort=False)
    ],
    ignore_index=True,
)
df_sumbox.head()

# Keep the full history under a second name before filtering.
df_sumbox2 = df_sumbox.copy()

# Keep only the rows for 2020-08-27. The explicit copy makes the column
# assignment below operate on an independent frame rather than on a slice.
# NOTE(review): a movie with no row on this date is absent from the ranking
# even if it has earlier running totals — confirm that this is intended.
df_sumbox = df_sumbox[df_sumbox['select_date'] == '2020-08-27'].copy()
df_sumbox.head()

# Cumulative box office divided by 10,000, stored under the label used for plotting.
df_sumbox['累计票房(万)'] = df_sumbox['cum_box'] / 10000
df_sumbox.head()

# Ten movies with the highest cumulative box office on that date.
box_top10 = df_sumbox.sort_values('cum_box', ascending=False).head(10)
box_top10

# Movie names and their cumulative box office (in 万) for the top-10 table.
x_data = box_top10['movieName'].tolist()
y_data = box_top10['累计票房(万)'].tolist()

# Vertical bar chart; the hidden visual map colours bars by value.
bar3 = Bar(init_opts=opts.InitOpts(width='1350px', height='750px'))
bar3.add_xaxis(x_data)
bar3.add_yaxis('', y_data)
bar3.set_global_opts(
    title_opts=opts.TitleOpts(title='复工以来影片累计票房排名Top10'),
    # Tilt the movie names so long titles do not overlap.
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
    yaxis_opts=opts.AxisOpts(name='', type_='value',
                             axislabel_opts=opts.LabelOpts(formatter="{value}万")),
    # Scale the colour range to the data instead of a hard-coded 143361.58, so
    # the chart stays correct when the input file is refreshed.
    visualmap_opts=opts.VisualMapOpts(is_show=False, max_=max(y_data, default=0)),
)
bar3.render()

'C:\\Users\\wzd\\Desktop\\CDA\\CDA_Python\\Python项目实作\\网络爬虫\\电影\\电影复工分析\\code\\render.html'

7楼 CDA网校 • 2020-09-01

影院复工后累计票房动图

# Divide the running totals by 10,000 in place.
# Re-running this statement divides again, so execute it only once per load.
df_sumbox2['cum_box'] = df_sumbox2['cum_box'].div(10000)
df_sumbox2.head()

# Wide table: one row per movie, one column per date, cells hold cum_box.
df_dongtu = df_sumbox2.pivot_table(
    values='cum_box', index='movieName', columns='select_date', aggfunc='sum'
).reset_index()
df_dongtu

# Export the wide table for the animated chart.
df_dongtu.to_excel('../data/复工动图数据.xlsx', index=False)

# import plotly as py 
# import plotly.graph_objs as go 
# import plotly.express as px 
# from plotly import tools

# # 条形图
# fig1 = px.bar(df_sumbox2, x='movieName', y='cum_box', color='movieName', text='cum_box',
#              title=' 影院复工后累计票房动图', 
#              animation_frame='select_date',
#              range_y=[0, 143361.58],
# #              orientation='v'
#              ) 
# fig1.update_layout(xaxis_title='影片', yaxis_title='累计票房（万）')
# py.offline.plot(fig1, filename=' 影院复工后累计票房动图.html')

' 影院复工后累计票房动图.html'

# Disabled: `px` is never bound in this file (the only plotly imports are
# commented out), so this call raised NameError, and its result was unused.
# px.bar_polar()

生成page

# Collect every chart on a single page and write it out as one HTML file.
# NOTE(review): line1 and bar4 are not defined anywhere in this excerpt;
# presumably they come from notebook cells missing from the page — confirm.
page = Page().add(bar1, map1, bar2, line2, line1, bar3, bar4)
page.render('电影复工数据分析.html')

'C:\\Users\\wzd\\Desktop\\CDA\\CDA_Python\\Python项目实作\\网络爬虫\\电影\\电影复工分析\\code\\电影复工数据分析.html'

发表回复

你还没有登录，请先登录或注册！

(大小不能超过 2 MB，文件类型支持png ,jpg ,gif ,doc ,xls ,txt ,rar ,zip .)

文件名	描述	金币	操作

Python专区加入小组

全国院线总票房破50亿！影院复工后，哪些电影最受欢迎？文章代码

影院复工数据分析

影院复工地图-淘票票

复工率Top5省份

全国影院复工地图

全国影院复工数Top10

猫眼电影票房分析

复工以来不同影片累计票房排名前十

影院复工后累计票房动图

生成page

新加组员

关于我们

CDA认证

常见问题

开源社区

服务条款

Python专区 加入小组

全国院线总票房破50亿！影院复工后，哪些电影最受欢迎？文章代码

影院复工数据分析

影院复工地图-淘票票

复工率Top5省份

全国影院复工地图

全国影院复工数Top10

猫眼电影票房分析

复工以来不同影片累计票房排名前十

影院复工后累计票房动图

生成page

新加组员

关于我们

CDA认证

常见问题

开源社区

服务条款

Python专区加入小组

复工以来不同影片累计票房排名前十