Python 数据挖掘:我分析了《乘风破浪的姐姐》,发现了这些秘密
如需数据集和源码,请先关注并点赞,再凭截图加入 QQ 群:606115027
读取数据
import pandas as pd
import matplotlib.pyplot as plt
# Load the contestant table. The CSV is GBK-encoded, so the encoding must be
# given explicitly or the Chinese text will not decode.
df = pd.read_csv("/home/kesci/input/sister5122/final_data.csv", encoding="gbk")
# Names carry stray carriage-return / newline characters; trim them.
# Bracket access is used for the assignment so it always targets the column
# and can never be mistaken for setting a plain attribute on the DataFrame.
df["names"] = df["names"].str.strip("\r\n")
# Bare expression: displays the frame when run as a notebook cell.
df
age birth hometown names primaryScore jobs picUrl
0 (51岁) 1969/3/4 台湾 伊能静 74 歌手,演员, bkimg.cdn.bcebos.com/pic/c2fdfc039...…
1 (35岁) 1984/10/15 河南 海陆 68 演员, bkimg.cdn.bcebos.com/pic/9345d688d...…
2 (34岁) 1985/10/21 吉林 许飞 75 歌手, bkimg.cdn.bcebos.com/pic/f703738da...…
3 (33岁) 1986/8/8 山东 张雨绮 72 演员, bkimg.cdn.bcebos.com/pic/b21c8701a...…
4 (31岁) 1989/4/9 四川 张含韵 79 歌手,演员, bkimg.cdn.bcebos.com/pic/0823dd545...…
5 (37岁) 1983/1/29 辽宁 吴昕 74 演员,主持人, bkimg.cdn.bcebos.com/pic/8644ebf81...…
6 (35岁) 1985/3/22 内蒙古 王丽坤 72 演员, bkimg.cdn.bcebos.com/pic/37d3d539b...…
7 (37岁) 1983/3/14 上海 金莎 68 歌手,演员, bkimg.cdn.bcebos.com/pic/5366d0160...…
8 (30岁) 1990/4/16 上海 蓝盈莹 91 演员, bkimg.cdn.bcebos.com/pic/10dfa9ec8...…
9 (37岁) 1983/2/11 上海 黄圣依 80 歌手,演员, bkimg.cdn.bcebos.com/pic/64380cd79...…
10 (39岁) 1981/3/6 天津 张萌 77 演员, bkimg.cdn.bcebos.com/pic/6d81800a1...…
11 (29岁) 1990/9/5 山东 金晨 80 模特,演员, bkimg.cdn.bcebos.com/pic/5bafa40f4...…
12 (32岁) 1988/4/18 云南 朱婧汐 76 歌手,音乐人, bkimg.cdn.bcebos.com/pic/728da9773...…
13 (48岁) 1972/4/27 贵州 宁静 84 演员, bkimg.cdn.bcebos.com/pic/574e9258d...…
14 (30岁) 1990/2/3 湖南 孟佳 87 歌手,演员, bkimg.cdn.bcebos.com/pic/2934349b0...…
15 (36岁) 1983/10/23 四川 郁可唯 85 歌手, bkimg.cdn.bcebos.com/pic/42166d224...…
16 (33岁) 1987/4/27 海南 王霏霏 84 歌手,演员, bkimg.cdn.bcebos.com/pic/267f9e2f0...…
17 (42岁) 1978/4/17 湖南 阿朵 79 歌手,演员, bkimg.cdn.bcebos.com/pic/f7246b600...…
18 (49岁) 1970/9/19 加拿大 钟丽缇 78 演员, bkimg.cdn.bcebos.com/pic/ac4bd1137...…
19 (38岁) 1981/9/6 上海 郑希怡 84 歌手,演员, bkimg.cdn.bcebos.com/pic/a8014c086...…
20 (30岁) 1990/4/26 四川 李斯丹妮 87 歌手,演员, bkimg.cdn.bcebos.com/pic/4e4a20a44...…
21 (37岁) 1982/12/26 湖南 刘芸 74 演员, bkimg.cdn.bcebos.com/pic/d8f9d72a6...…
22 (34岁) 1986/5/2 陕西 白冰 79 演员, bkimg.cdn.bcebos.com/pic/c8ea15ce3...…
23 (37岁) 1982/7/29 辽宁 王智 68 演员, bkimg.cdn.bcebos.com/pic/d1160924a...…
24 (33岁) 1987/2/13 上海 黄龄 89 歌手,演员, bkimg.cdn.bcebos.com/pic/4610b912c...…
25 (33岁) 1986/11/14 美国 袁咏琳 83 歌手,演员, bkimg.cdn.bcebos.com/pic/060828381...…
26 (38岁) 1982/4/17 浙江 丁当 75 歌手, bkimg.cdn.bcebos.com/pic/d01373f08...…
27 (38岁) 1982/5/14 湖南 万茜 77 歌手,演员, bkimg.cdn.bcebos.com/pic/5bafa40f4...…
28 (31岁) 1989/6/13 湖南 沈梦辰 86 演员,主持人, bkimg.cdn.bcebos.com/pic/3801213fb...…
29 (49岁) 1971/1/21 香港 陈松伶 73 歌手,演员, bkimg.cdn.bcebos.com/pic/b90e7bec5...…
# The raw age column holds strings such as "(51岁)". Pull out the digits and
# convert to int; extracting the number (instead of deleting specific
# characters one by one) also copes with full-width brackets or extra spaces.
df["age"] = df["age"].str.extract(r"(\d+)", expand=False).astype(int)
from pyecharts import Pie, Bar, Line
from pyecharts.charts import Pie, Bar, Line, Funnel
from pyecharts.options.global_options import ThemeType
from pyecharts import options as opts
from pyecharts.charts import Pie, Bar, line
姐姐的年龄分布
# Bucket the ages into four ranges and count how many contestants fall in each.
# pd.cut builds right-closed bins by default, so an age of 33 is counted
# under "26-33".
age_cut = pd.cut(df.age, [26, 33, 40, 47, 54], labels=["26-33", "33-40", "40-47", "47-54"])
age_counts = age_cut.value_counts()
attr = [str(label) for label in age_counts.index]
# value_counts() yields numpy integers; hand pyecharts plain Python ints.
count = [int(n) for n in age_counts]

# Rose-style donut chart of the age buckets, labelled "<bucket>: <percent>%".
pie = (
    Pie(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
    .add(
        "",
        [list(z) for z in zip(attr, count)],
        radius=["30%", "75%"],
        rosetype="radius",
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="《乘风破浪的姐姐》", subtitle="年龄分布"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
)
pie.render_notebook()
姐姐的职业分布
from collections import Counter
# Count how often each job title occurs. Every row is a comma-separated list
# (e.g. "歌手,演员,"); split row by row so that two rows can never be fused
# into one bogus title when a row lacks the trailing comma, and skip the
# empty pieces produced by trailing commas.
jobsClass = Counter(
    job.strip()
    for jobs in df.jobs
    for job in jobs.split(",")
    if job.strip()
)

# Funnel chart of job frequencies, smallest at the top.
funnel = (
    Funnel(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
    .add(
        "《乘风破浪的姐姐》",
        [list(pair) for pair in jobsClass.items()],
        sort_="ascending",
        label_opts=opts.LabelOpts(position="inside"),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="《乘风破浪的姐姐》", subtitle="职业分布"))
)
funnel.render_notebook()
姐姐的省份分布
from pyecharts.charts import Map
import random
# Tally contestants per hometown and echo the tally for reference.
provinces = Counter(df.hometown)
print(provinces)
# (region, count) pairs in the form Map.add() expects.
area = list(provinces.items())

# Choropleth over the "china" base map.
# NOTE(review): entries that are not regions of that map (e.g. 加拿大, 美国)
# presumably will not be drawn - confirm against the rendered chart.
maps = (
    Map(init_opts=opts.InitOpts(theme=ThemeType.ROMANTIC))
    .add("出生地", area, "china")
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Map-基本示例"),
        legend_opts=opts.LegendOpts(is_show=False),
        visualmap_opts=opts.VisualMapOpts(max_=5, is_piecewise=True),
    )
)
maps.render_notebook()
Counter({‘上海’: 5, ‘湖南’: 5, ‘四川’: 3, ‘山东’: 2, ‘辽宁’: 2, ‘台湾’: 1, ‘河南’: 1, ‘吉林’: 1, ‘内蒙古’: 1, ‘天津’: 1, ‘云南’: 1, ‘贵州’: 1, ‘海南’: 1, ‘加拿大’: 1, ‘陕西’: 1, ‘美国’: 1, ‘浙江’: 1, ‘香港’: 1})
姐姐的年龄和初始舞台评分关系
from pyecharts import options as opts
from pyecharts.charts import Bar, Line
# Take the first five rows of the table (file order, not a ranking) and plot
# each contestant's age (bars, left axis) against her first-stage score
# (line, right axis).
top5 = df.head(5)
names = top5.names.values.tolist()
ages = top5.age.values.tolist()
scores = top5.primaryScore.values.tolist()

# Round the age axis up to the next multiple of ten above the oldest plotted
# contestant, so no bar is clipped by a fixed upper bound.
age_axis_max = (max(ages, default=0) // 10 + 1) * 10

bar = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.ROMANTIC))
    .add_xaxis(names)
    .add_yaxis("年龄", ages)
    # One secondary y-axis (index 1) for the scores; the line series below
    # binds to it via yaxis_index=1.
    .extend_axis(
        yaxis=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(formatter="{value}分"), interval=20
        )
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="《乘风破浪的姐姐》"),
        yaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(formatter="{value}岁"),
            min_=0,
            max_=age_axis_max,
        ),
    )
)
line = Line().add_xaxis(names).add_yaxis("初舞台评分", scores, yaxis_index=1)
bar.overlap(line)
bar.render_notebook()
前 5 位姐姐的平均年龄
# `ages` holds only the rows selected for the bar chart (the first five of the
# table), so this is the mean age of that subset, not of the whole cast.
# Divide by len(ages) rather than a literal 5 so the figure stays correct if
# the subset size changes.
print(sum(ages) / len(ages))
36.8
本作品采用《CC 协议》,转载必须注明作者和本文链接