Day66 数据分析之数据可视化

模拟数据分析真实项目流程
明确问题：明确数据分析的真实需求
理解数据：数据获取和数据探索
数据清洗：一个数据分析项目大部分时间花在了数据清洗上
数据分析和可视化：对清洗后的数据进行分析，并且通过可视化展示出结果
结论和建议：对结果进行解读，得出有价值的结论并且提出相关建议

matplotlib应用

–用pygal实现数据可视化之掷骰子

die = Die()

# Roll the die 1000 times and keep every outcome.
results = [die.roll() for _ in range(1000)]

# Count how many times each face (1..num_sides) appeared in the 1000 rolls.
sides = die.num_sides
frequencies = [results.count(value) for value in range(1, sides + 1)]

print(frequencies)

import pygal

graph = pygal.Bar()  # bar chart
graph.title = "Results of rolling one D6 1000 times"
# The pygal option is ``x_labels``; assigning to ``xlabels`` only creates an
# unused attribute, so no labels would be drawn. Labels are derived from the
# die so they stay correct if the number of sides changes.
graph.x_labels = [str(value) for value in range(1, sides + 1)]
graph.x_title = "Result"
graph.y_title = "Frequency of result"
graph.add("D6", frequencies)
# Write the chart to an SVG file; open it in a browser to view it.
graph.render_to_file("die.svg")

# Example output (varies from run to run): [164, 163, 175, 154, 173, 171]

同时掷两个面数相同的骰子

from random import randint
class Die:
    """A single die with a configurable number of sides."""

    def __init__(self, num_sides: int = 6) -> None:
        """Store the number of sides; a die has 6 sides by default."""
        self.num_sides = num_sides

    def roll(self) -> int:
        """Return a random integer between 1 and the number of sides, inclusive."""
        highest_face = self.num_sides
        return randint(1, highest_face)


# Roll two dice that have the same number of sides.
die_1 = Die()
die_2 = Die()

# Roll both dice 1000 times and record the sum of each throw.
results = [die_1.roll() + die_2.roll() for _ in range(1000)]

# Count how often each possible sum occurred. The smallest sum is 2 (both
# dice show 1); the largest is the sum of both dice's side counts (12 here).
sides = die_1.num_sides + die_2.num_sides
frequencies = [results.count(value) for value in range(2, sides + 1)]

print(frequencies)

import pygal

graph = pygal.Bar()  # bar chart
# The original title said "one D6" although two dice are rolled.
graph.title = "Results of rolling two D6 dice 1000 times"
# The pygal option is ``x_labels``; assigning to ``xlabels`` only creates an
# unused attribute, so no labels would be drawn. The labels follow the range
# of possible sums (2..12 for two six-sided dice).
graph.x_labels = [str(value) for value in range(2, sides + 1)]
graph.x_title = "Result"
graph.y_title = "Frequency of result"
graph.add("2*D6", frequencies)
# Write the chart to an SVG file; open it in a browser to view it.
graph.render_to_file("die_1+die_2.svg")

同时掷两个面数不同的骰子

–CSV文件格式

import csv
from matplotlib import pyplot as plt
from datetime import datetime

filename = "sitka_weather_07-2018_simple.csv"

# The csv module asks for files to be opened with newline="" so that it can
# handle line endings (including newlines embedded in quoted fields) itself.
with open(filename, newline="") as f:
    reader = csv.reader(f)
    # csv.reader splits each comma-separated line into a list of strings;
    # next() returns the next line of the file, which here is the header row.
    header_row = next(reader)
    # To inspect the column layout, run:
    #     for index, column_header in enumerate(header_row):
    #         print(index, column_header)

    # Collect the date (column index 2) and the maximum temperature
    # (column index 5) from every remaining row.
    dates, maxtems = [], []
    for row in reader:
        dates.append(datetime.strptime(row[2], "%Y-%m-%d"))
        # The values are read as strings; convert to int so matplotlib can
        # plot them as numbers.
        maxtems.append(int(row[5]))

    print(maxtems, dates)

figure = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(dates, maxtems, c='red')  # plot() takes the x and y value lists

plt.title("Daily max temperatures in 07-2018 in sitka", fontsize=25)
plt.xlabel('', fontsize=16)
figure.autofmt_xdate()  # draw the date labels diagonally so they do not overlap
plt.ylabel("Temperature", fontsize=16)
plt.tick_params(axis="both", which='major', labelsize=16)

plt.show()

更改文件源，使用全年数据

在上图基础上再添加最低气温

import csv
from matplotlib import pyplot as plt
from datetime import datetime

filename = "sitka_weather_2018_simple.csv"

# Plot the daily maximum and minimum temperatures for the whole year.
# The csv module asks for files to be opened with newline="" so that it can
# handle line endings (including newlines embedded in quoted fields) itself.
with open(filename, newline="") as f:
    reader = csv.reader(f)
    # csv.reader splits each comma-separated line into a list of strings;
    # next() returns the next line of the file, which here is the header row.
    header_row = next(reader)

    # Print every column index with its header to show the file layout.
    for index, column_header in enumerate(header_row):
        print(index, column_header)

    # Collect the date (column index 2), the maximum temperature (index 5)
    # and the minimum temperature (index 6) from every remaining row.
    dates, maxtems, mintems = [], [], []
    for row in reader:
        dates.append(datetime.strptime(row[2], "%Y-%m-%d"))
        # The values are read as strings; convert to int so matplotlib can
        # plot them as numbers.
        maxtems.append(int(row[5]))
        mintems.append(int(row[6]))

    print(maxtems, mintems)


figure = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(dates, maxtems, c='red')  # plot() takes the x and y value lists
plt.plot(dates, mintems, c='blue')  # second line; the same dates are reused
plt.title("Daily temperatures in 2018 in sitka", fontsize=26)
plt.xlabel('', fontsize=10)
figure.autofmt_xdate()  # draw the date labels diagonally so they do not overlap
plt.ylabel("Temperature", fontsize=16)
plt.tick_params(axis="both", which='major', labelsize=16)
plt.show()

当 CSV 文件中存在空值（缺失数据）时，int() 转换会抛出 ValueError，可以用 try-except-else 来处理：

    for row in reader:
        # Reset for every row so the error message can never show the date
        # of a previous row (or hit an undefined name on the first row) when
        # the date field itself is the value that fails to parse.
        date = None
        try:
            date = datetime.strptime(row[2], "%Y-%m-%d")
            maxtem = int(row[5])
            mintem = int(row[6])
        except ValueError:
            # Report the parsed date when available, otherwise the raw text.
            print(date if date is not None else row[2], "missing data")
        else:
            # Only rows whose three fields all converted are stored, which
            # keeps the three lists the same length.
            dates.append(date)
            maxtems.append(maxtem)
            mintems.append(mintem)

输出为：

2018-02-18 00:00:00 missing data

修改X轴，Y轴坐标上下限：plt.axis()
修改X轴上下限：plt.xlim()
修改Y轴上下限：plt.ylim()

–JSON格式

JSON 文件中的每条记录由国家、人口和年份组成；借助 pygal_maps_world 包中的 i18n.COUNTRIES 字典（国别码到国家名的映射），可以根据国家名查出对应的国别码。

from pygal_maps_world import i18n
# i18n.COUNTRIES maps two-letter country codes (keys) to country names (values).

# Named ``countries`` rather than ``dict`` so the builtin ``dict`` type is not
# shadowed.
countries = i18n.COUNTRIES
# Print every country code with its country name, ordered by code.
for country_code in sorted(countries):
    print(country_code, countries[country_code])

import json
from pygal_maps_world import i18n


# Mapping of two-letter country code -> country name. Named ``COUNTRIES``
# rather than ``dict`` so the builtin ``dict`` type is not shadowed.
COUNTRIES = i18n.COUNTRIES


def get_country_code(country_title):
    """Return the two-letter pygal country code for the given country name.

    The name must match an entry of ``i18n.COUNTRIES`` exactly. Returns
    ``None`` when no entry has that name.
    """
    for code, name in COUNTRIES.items():
        if name == country_title:
            return code
    # No country with this exact name is known.
    return None


filename = 'population_data.json'

# Run the listing only when this file is executed as a script, so that
# importing get_country_code from this module does not load the data file
# and print the whole listing as a side effect.
if __name__ == "__main__":
    with open(filename) as f:
        # json.load() converts the file content into Python objects; here the
        # result is a list of dictionaries.
        world_maps = json.load(f)

    for world_map in world_maps:
        # Only the 2010 population figures are wanted.
        if world_map['Year'] != '2010':
            continue

        country_name = world_map['Country Name']
        # The value is a string that may contain a decimal part, so convert
        # through float before truncating to int.
        population = int(float(world_map['Value']))

        # Names in the JSON data do not always match the names in the
        # country-code table, so check whether a code was found.
        code = get_country_code(country_name)
        if code:
            print(f"{country_name}:{code}:{population}")
        else:
            print('Error--' + country_name)

# Not every population entry belongs to a country: some describe regions
# (e.g. the Arab World) or economic groups (e.g. all income levels). Some
# entries also use a different full country name (e.g. "Yemen, Rep." instead
# of "Yemen"), so no code is found for them.

制作世界地图

import pygal
# Each entry pairs a legend title with the country codes of that region.
americas = [
    ('North America', ['ca', 'mx', 'us']),
    ('Central America', ['bz', 'cr', 'gt', 'hn', 'ni', 'pa', 'sv']),
    ('South America', ['ar', 'bo', 'br', 'cl', 'co', 'ec', 'gf',
                       'gy', 'pe', 'py', 'sr', 'uy', 've']),
]

wm = pygal.maps.world.World()
wm.title = 'North, Central, and South America'
# Add one series per region, in the order listed above.
for region_title, country_codes in americas:
    wm.add(region_title, country_codes)
wm.render_to_file('americas.svg')

绘制世界人口地图

import json 
import pygal 
from world_map_json import get_country_code
from pygal.style import RotateStyle  #颜色样式
# 将数据加载到列表中


filename = 'population_data.json'
with open(filename) as f:
    # json.load() converts the file content into Python objects; here the
    # result is a list of dictionaries.
    world_maps = json.load(f)


# Build a dictionary of country code -> 2010 population.
cc_populations = {}
for pop_dict in world_maps:
    if pop_dict['Year'] != '2010':
        continue
    code = get_country_code(pop_dict['Country Name'])
    if not code:
        # No matching country code for this entry; leave it out.
        continue
    cc_populations[code] = int(float(pop_dict['Value']))

# Split the countries into three population bands so that each band is drawn
# as its own series on the map.
cc_1, cc_2, cc_3 = {}, {}, {}
for cc, population in cc_populations.items():
    if population < 10000000:
        cc_3[cc] = population
    elif population < 1000000000:
        cc_2[cc] = population
    else:
        cc_1[cc] = population

wm_style = RotateStyle("#447722")
wm = pygal.maps.world.World(style=wm_style)  # world map instance
wm.title = 'World Population in 2010, by Country'
# Each series receives a dictionary of country code -> population.
for band_title, band in (('>1bn', cc_1), ('10m-1bn', cc_2), ('0-10m', cc_3)):
    wm.add(band_title, band)

wm.render_to_file('world_population.svg')

unittest.TestCase 常用断言方法：
assertEqual(a, b) 核实a == b
assertNotEqual(a, b) 核实a != b
assertTrue(x) 核实x为True
assertFalse(x) 核实x为False
assertIn(item, list) 核实item在list中
assertNotIn(item, list) 核实item不在list中

本文地址：https://blog.csdn.net/marpleaaa/article/details/110674572

黄山市民网：https://www.huangshanshimin.com/

Day66 数据分析 之数据可视化