Python课程作业

课程论文

Python课程最终考核方式是课程论文,要求就是做一个数据分析,提交文档、源码及数据源。

数据源

我的数据源来自和鲸社区世界幸福报告

目标

分析的主要目标有:本次调查所有国家或地区在全世界各区域的数量分布状况;由此展开区域整体幸福感对比,如各大地区幸福指数对比、各大地区幸福指数区间分布,以及2020年世界幸福报告前10名和后10名;最后通过可视化直观呈现结果,如用热力图体现幸福指数与各调查项目间的相关关系,用幸福地图体现各国家和地区的幸福指数对比。

方法及类库

其中主要分析方法是Python的numpy和pandas的相关数据分析方法,再加上一些可视化库方法主要有matplotlib和seaborn,更重要的是pyecharts的各种工具,库里的很多可视化都非常漂亮,尤其是世界地图。经过分析数据和可视化,得出分析结论。

参考

世界幸福指数分析哪个国家最幸福

数据可视化--世界幸福指数报告

世界幸福报告数据分析

开始

类库导入

1
2
3
4
5
6
7
8
9
10
11
12
13
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from pyecharts.charts import Bar, Map, Line, Page, Pie, Grid
from pyecharts import options as opts
import statsmodels.api as sm
import statsmodels.formula.api as smf

plt.rcParams['font.sans-serif'] = ['SimHei'] # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False # render the minus sign correctly

读取数据及数据处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Load the three yearly World Happiness Report CSVs and do basic cleanup.
# NOTE(review): the path is machine-specific; point it at your local data folder.
s = r'C:\Users\86183\Desktop\课程论文\data'
df_2018 = pd.read_csv(s+'/2018.csv', encoding='gbk')
df_2019 = pd.read_csv(s+'/2019.csv', encoding='gbk')
df_2020 = pd.read_csv(s+'/2020.csv', encoding='gbk')

# Keep only the first 12 columns of the 2020 report, then drop the
# statistical-uncertainty columns that the analysis does not use.
df_2020 = df_2020.iloc[:, :12]
columns = ['Standard error of ladder score', 'upperwhisker', 'lowerwhisker']
df_2020.drop(columns, axis=1, inplace=True)


# Print a structural and statistical summary of the 2020 data.
def data_info():
    # df.info() prints directly and returns None, so don't wrap it in print()
    # (the original printed an extra "None" line).
    df_2020.info()
    print(df_2020.describe())
数据信息

主体

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# World map colored by each country's 2020 happiness score.
def HappinessMap():
    x_data = df_2020['Country name'].tolist()
    y_data = df_2020['Ladder score'].round(2).tolist()

    # Piecewise visual map; pieces are contiguous so every score matches a
    # band (the original minima of 3.1/4.1/... left scores such as 3.05 with
    # no band and therefore no color on the map).
    map1 = Map()
    map1.add('', [list(z) for z in zip(x_data, y_data)], maptype='world',)
    map1.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    map1.set_global_opts(
        title_opts=opts.TitleOpts(title='Happiness Score'),
        visualmap_opts=opts.VisualMapOpts(
            max_=8,
            is_piecewise=True,
            pieces=[
                {'min': 0, 'max': 3, 'label': '0-3'},
                {'min': 3, 'max': 4, 'label': '3-4'},
                {'min': 4, 'max': 5, 'label': '4-5'},
                {'min': 5, 'max': 6, 'label': '5-6'},
                {'min': 6, 'max': 7, 'label': '6-7'},
                {'min': 7, 'max': 8, 'label': '7-8'},
            ],
        ),
    )
    map1.render("幸福地图.html")

幸福地图
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Horizontal bar chart of the mean happiness score per regional indicator.
def country_score_of_every_region():
    region_means = (
        df_2020.groupby('Regional indicator')['Ladder score'].mean().sort_values()
    )
    labels = region_means.index.tolist()
    values = region_means.values.round(2).tolist()

    bar1 = (
        Bar()
        .add_xaxis(labels)
        .add_yaxis('', values)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title='Average Happiness Score by Regional Indicator'))
        .set_series_opts(
            label_opts=opts.LabelOpts(is_show=True, position='right'),
            markline_opts=opts.MarkLineOpts(
                data=[opts.MarkLineItem(type_="average", name="平均值")]))
    )
    bar1.set_colors('CadetBlue')
    bar1.reversal_axis()  # flip to horizontal bars

    # Extra left padding so the long region names fit.
    grid = Grid()
    grid.add(bar1, grid_opts=opts.GridOpts(pos_left="20%"))
    grid.render("各大地区的平均幸福指数.html")

得分
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Rose pie of how many countries fall in each one-point score interval.
def regions_country_pie():
    df2 = df_2020.iloc[:, :3]
    df2['score_group'] = pd.cut(
        x=df2['Ladder score'], bins=[0, 1, 2, 3, 4, 5, 6, 7, 8]).astype('str')

    # Count countries per interval once (the original recomputed the
    # groupby twice for index and values).
    counts = df2.groupby('score_group').score_group.count()
    data_pairs = [list(pair) for pair in
                  zip(counts.index.tolist(), counts.values.tolist())]

    pie = Pie()
    pie.add(
        '',
        data_pairs,
        radius=["30%", "75%"],
        rosetype="area",
        label_opts=opts.LabelOpts(position='right'),
    )
    pie.set_global_opts(
        title_opts=opts.TitleOpts(
            title='Number of Countries by Happiness Score Categories'),
        legend_opts=opts.LegendOpts(is_show=False),
    )
    pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:\n {c} ({d}%)"))
    pie.render("各幸福区间的百分比.html")
比例
1
2
3
4
5
6
7
8
9
10
# Count plot of how many surveyed countries belong to each region.
def every_regions_country_num():
    subset = df_2020[['Regional indicator', 'Ladder score', 'Country name']]
    plt.figure()
    sns.countplot(x='Regional indicator', data=subset)
    plt.xticks(rotation='vertical')  # region names are long
    plt.title('Country name count of Regional indicator')
    plt.show()

分布
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Bar charts of the 10 highest- and 10 lowest-ranked countries in 2020.
# NOTE(review): relies on df_2020 already being sorted by Ladder score
# (the published report CSV is) — confirm if the data source changes.
def Top_and_Bottem_10_country_happiness():
    rank_top10 = df_2020.head(10)[['Country name', 'Ladder score']]
    last_top10 = df_2020.tail(10)[['Country name', 'Ladder score']]
    plt.figure()
    ax = plt.subplot(1, 2, 1)
    sns.barplot(x='Country name', y='Ladder score', data=rank_top10)
    plt.xticks(rotation='vertical')
    plt.title('Top 10 Country name')
    # Share the y-axis so the score gap between panels is visible.
    ax = plt.subplot(1, 2, 2, sharey=ax)
    sns.barplot(x='Country name', y='Ladder score', data=last_top10)
    plt.xticks(rotation='vertical')
    plt.title('Bottom 10 Country name')  # fixed displayed typo "Bottem"
    plt.show()

2020前10后10
1
2
3
4
5
6
7
8
# Heat map of pairwise correlations between the numeric 2020 columns.
def Heat_map():
    # numeric_only=True: df_2020 contains string columns ('Country name',
    # 'Regional indicator'); pandas >= 2.0 raises on corr() without it.
    cor = df_2020.corr(numeric_only=True)
    plt.figure(figsize=(8, 8))
    plt.title('Correlation between numeric variables', fontsize=18)
    sns.heatmap(cor, square=True, cmap="vlag", annot=True, linewidths=0.5)
    plt.show()

热力图
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Scatter plus fitted OLS line of factor column `x` against the happiness
# score, one subplot per year (2018, 2019, 2020).
# Rows with NaN in either column are dropped first: the 2018 data contains a
# NaN in 'Perceptions of corruption', which made Corruption_Score() fail.
def XXX_Score(x):
    plt.figure()
    ax = plt.subplot(3, 1, 1)
    d18 = df_2018[[x, 'Score']].dropna()
    fm_2018 = sm.OLS(d18['Score'], sm.add_constant(d18[x])).fit()
    plt.scatter(d18[x], d18['Score'])
    plt.plot(d18[x], fm_2018.fittedvalues)
    plt.title('2018-2020 '+x+' and Ladder score', fontsize=18)
    ax = plt.subplot(3, 1, 2)
    d19 = df_2019[[x, 'Score']].dropna()
    fm_2019 = sm.OLS(d19['Score'], sm.add_constant(d19[x])).fit()
    plt.scatter(d19[x], d19['Score'])
    plt.plot(d19[x], fm_2019.fittedvalues)
    ax = plt.subplot(3, 1, 3)
    d20 = df_2020[[x, 'Ladder score']].dropna()
    fm = sm.OLS(d20['Ladder score'], sm.add_constant(d20[x])).fit()
    plt.scatter(d20[x], d20['Ladder score'])
    plt.plot(d20[x], fm.fittedvalues)

    plt.show()

# GDP per capita vs. happiness score: OLS-fitted scatter, one panel per year.
# The 2020 report names its columns differently ('Logged GDP per capita',
# 'Ladder score'), hence the per-year column names below.
def GDP_Score():
    plt.figure()
    panels = [
        (1, df_2018, 'GDP per capita', 'Score'),
        (2, df_2019, 'GDP per capita', 'Score'),
        (3, df_2020, 'Logged GDP per capita', 'Ladder score'),
    ]
    for pos, frame, gdp_col, score_col in panels:
        plt.subplot(3, 1, pos)
        model = sm.OLS(frame[score_col],
                       sm.add_constant(frame[gdp_col])).fit()
        plt.scatter(frame[gdp_col], frame[score_col])
        plt.plot(frame[gdp_col], model.fittedvalues)
        if pos == 1:
            plt.title('2018-2020 GDP per capita and Ladder score', fontsize=18)

    plt.show()

# Healthy life expectancy vs. happiness score.
def Healthy_Score():
    XXX_Score('Healthy life expectancy')

# Social support vs. happiness score.
def Socialsupport_Score():
    XXX_Score('Social support')

# Freedom to make life choices vs. happiness score.
def Freedom_Score():
    XXX_Score('Freedom to make life choices')

# Generosity vs. happiness score.
def Generosity_Score():
    XXX_Score('Generosity')

# Perceptions of corruption vs. happiness score
# (original note: errors out because the 2018 data contains NaN).
def Corruption_Score():
    XXX_Score('Perceptions of corruption')

拟合1
拟合2
拟合3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Compare China against the world average for 2018-2020 on every survey
# factor: bars = China's values, overlaid line = world means.
def All_China():
    # x-axis labels: one entry per report year.
    l0=['2018','2019','2020']

    # China's row in each year's data. 2018/2019 name the country column
    # 'Country or region'; 2020 names it 'Country name'.
    df_2018_china=df_2018.loc[df_2018['Country or region'] == 'China']
    df_2019_china=df_2019.loc[df_2019['Country or region'] == 'China']
    df_2020_china=df_2020.loc[df_2020['Country name'] == 'China']

    # la* = world mean per year; lc* = China's value per year.
    # 1: overall happiness score.
    la1=[df_2018['Score'].mean(),df_2019['Score'].mean(),df_2020['Ladder score'].mean()]
    lc1=df_2018_china['Score'].tolist()+df_2019_china['Score'].tolist()+df_2020_china['Ladder score'].tolist()

    # 2: GDP per capita. NOTE(review): the 2020 column is *logged* GDP, so
    # its scale differs from 2018/2019 — cross-year comparison is rough.
    la2=[df_2018['GDP per capita'].mean(),df_2019['GDP per capita'].mean(),df_2020['Logged GDP per capita'].mean()]
    lc2=df_2018_china['GDP per capita'].tolist()+df_2019_china['GDP per capita'].tolist()+df_2020_china['Logged GDP per capita'].tolist()

    # 3: social support.
    la3=[df_2018['Social support'].mean(),df_2019['Social support'].mean(),df_2020['Social support'].mean()]
    lc3=df_2018_china['Social support'].tolist()+df_2019_china['Social support'].tolist()+df_2020_china['Social support'].tolist()

    # 4: healthy life expectancy.
    la4=[df_2018['Healthy life expectancy'].mean(),df_2019['Healthy life expectancy'].mean(),df_2020['Healthy life expectancy'].mean()]
    lc4=df_2018_china['Healthy life expectancy'].tolist()+df_2019_china['Healthy life expectancy'].tolist()+df_2020_china['Healthy life expectancy'].tolist()

    # 5: freedom to make life choices.
    la5=[df_2018['Freedom to make life choices'].mean(),df_2019['Freedom to make life choices'].mean(),df_2020['Freedom to make life choices'].mean()]
    lc5=df_2018_china['Freedom to make life choices'].tolist()+df_2019_china['Freedom to make life choices'].tolist()+df_2020_china['Freedom to make life choices'].tolist()

    # 6: generosity.
    la6=[df_2018['Generosity'].mean(),df_2019['Generosity'].mean(),df_2020['Generosity'].mean()]
    lc6=df_2018_china['Generosity'].tolist()+df_2019_china['Generosity'].tolist()+df_2020_china['Generosity'].tolist()

    # 7: perceptions of corruption — computed but its series are commented
    # out below (the 2018 data contains NaN).
    la7=[df_2018['Perceptions of corruption'].mean(),df_2019['Perceptions of corruption'].mean(),df_2020['Perceptions of corruption'].mean()]
    lc7=df_2018_china['Perceptions of corruption'].tolist()+df_2019_china['Perceptions of corruption'].tolist()+df_2020_china['Perceptions of corruption'].tolist()

    # Bars: China's per-year values for each factor.
    bar=(
        Bar()
        .add_xaxis(l0)
        .add_yaxis("中国幸福指数",lc1)
        .add_yaxis("中国GDP指数",lc2)
        .add_yaxis("中国Social指数",lc3)
        .add_yaxis("中国Healthy指数",lc4)
        .add_yaxis("中国Freedom指数",lc5)
        .add_yaxis("中国Generosity指数",lc6)
        # .add_yaxis("中国Corruption指数",lc7)
    )
    # Overlaid line: the world averages for the same factors.
    line=(
        Line()
        .add_xaxis(l0)
        .add_yaxis("世界幸福指数",la1)
        .add_yaxis("世界GDP指数",la2)
        .add_yaxis("世界Social指数",la3)
        .add_yaxis("世界Healthy指数",la4)
        .add_yaxis("世界Freedom指数",la5)
        .add_yaxis("世界Generosity指数",la6)
        # .add_yaxis("世界Corruption指数",la7)
    )
    bar.overlap(line).render("2018-2020中国与世界平均水平对比.html")

调用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
print('main')


# Entry point: uncomment the analyses you want to run.
# print(df_2020)
# data_info()
# HappinessMap()
# country_score_of_every_region()
# regions_country_pie()
# every_regions_country_num()
# Top_and_Bottem_10_country_happiness()
# Heat_map()
# GDP_Score()
# Healthy_Score()
# Socialsupport_Score()
# Freedom_Score()
# Generosity_Score()

# Errors out: the 2018 data contains NaN values.
# Corruption_Score()

All_China()

总结

上面的其实大部分都是参考过来的,自己也添加了一些内容,上面主要是代码和可视化图片,结论分析并没有展现。期间还用VSCode使用了Jupyter Notebook,参考来自VSCode插件 优雅地使用Jupyter Notebook

python也只学了一学期,期间还因为课程安排在周六,被会计考试、竞赛、教师资格证、四六级、研究生考试等各种原因,实际上只上了几周,本来这门课就是普通的考查课,老师也没有要求那么严格,但是学习Python还是挺有趣的。好像有数据表明Python在年底时超越了java,成为使用最多的语言,的确在某些方面Python与其它语言是有着不少的优势,近几年人工智能、机器学习大火,与之脱不开关系。