資料分析：1880-2020 全美嬰兒姓名

# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Column labels for the header-less yobYYYY.txt files.
# NOTE(review): '***' is kept exactly as this file spells it (presumably a
# redacted 'sex' -- confirm). It is only a label assigned at read time, so it
# works as long as every later reference uses the same string.
columns = ['name', '***', 'births']

# Single-year sample, handy for interactive inspection.
names1880 = pd.read_csv('names/yob1880.txt', names=columns)
# print(names1880)
# Total births per '***' group:
# print(names1880.groupby('***').births.sum())

# Read every yearly file and tag its rows with the year it came from.
years = range(1880, 2011)
pieces = []
for year in years:
    path = 'names/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)

# Stack all years into a single DataFrame; ignore_index=True discards the
# per-file row numbers in favour of one fresh running index.
names = pd.concat(pieces, ignore_index=True)
# print(names[5000:5010])
# Aggregate births into a year (rows) x '***' (columns) table.
# pivot_table takes index=/columns=; the older rows=/cols= keywords no longer
# exist. aggfunc is given as the string 'sum', matching the last pivot in
# this script.
total_births = names.pivot_table('births', index='year', columns='***', aggfunc='sum')
# .tail() returns the last rows (5 by default)
# print(total_births.tail())
# Fraction of births each name represents within its group
def add_prop(group):
    """Add a ``prop`` column holding each row's share of the group's births.

    Args:
        group: DataFrame with a numeric ``births`` column.

    Returns:
        The same ``group`` object; ``prop`` is assigned on it in place as
        ``births / births.sum()``.
    """
    # Cast to float first so the ratio can never be an integer (floor) division.
    births = group.births.astype(float)
    group['prop'] = births / births.sum()
    return group
# Attach 'prop' within every (year, '***') group. The statement applying
# add_prop is absent from this file although 'prop' is read further down, so
# it is restored here. Groups are processed one at a time and re-concatenated
# so that 'year' and '***' remain ordinary columns in the result; each group
# is copied because add_prop assigns on the frame it receives.
names = pd.concat(
    [add_prop(group.copy()) for _, group in names.groupby(['year', '***'])]
)
# print(names)
# Sanity check: the proportions of every group should sum to 1
# print(np.allclose(names.groupby(['year', '***']).prop.sum(), 1))
# Take a subset: the top 1000 names of each '***'/year combination
def get_top1000(group):
    """Return the rows of ``group`` with the most births, at most 1000.

    Args:
        group: DataFrame with a ``births`` column.

    Returns:
        A DataFrame sorted by ``births`` in descending order and truncated
        to its first 1000 rows (fewer if the group is smaller).
    """
    # sort_values replaces the removed sort_index(by=...) spelling.
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', '***'])
# Build top1000, which the rest of the script reads but which is never
# assigned in this file: apply get_top1000 to each group and stack the
# results under a fresh running index.
top1000 = pd.concat(
    [get_top1000(group) for _, group in grouped], ignore_index=True
)
# print(top1000)
# --- Naming trends ---
# Split the top-1000 table into a boys part and a girls part.
# Bracket indexing is required: '***' is not a valid attribute name.
# NOTE(review): 'M'/'F' assumes the data files code this column in upper
# case (this file had other identifiers lower-cased, e.g. True/False) -- confirm.
boys = top1000[top1000['***'] == 'M']
girls = top1000[top1000['***'] == 'F']
# Pivot table of total births by year (rows) and name (columns)
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
# print(total_births.tail())
# Select a handful of names to plot. Names are capitalised here because the
# data is mixed-case (the 'lesl' search later lower-cases names before
# matching); all-lower-case labels would not be found as columns.
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
# subset.plot(subplots=True, figsize=(12, 10), grid=False, title="number of births per year")
# --- Growth in naming diversity ---
# Share of births covered by the 1000 most popular names, per year and '***'
table = top1000.pivot_table('prop', index='year', columns='***', aggfunc='sum')
# table.plot(title='sum of table1000.prop by year and ***',
#            yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
# Number of distinct names that make up the top 50% of births
df = boys[boys.year == 2010]
# print(df)
# cumsum gives the running total of 'prop' from most to least popular
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()
# print(prop_cumsum.searchsorted(0.5))
# (original author's note: .searchsorted raised an error on their machine)
df = boys[boys.year == 1900]
in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()
# print(in1900)
# in1900.searchsorted(0.5) + 1
def get_quantile_count(group, q=0.5):
    """Count how many of the most popular rows are needed to reach share ``q``.

    Args:
        group: DataFrame with a ``prop`` column.
        q: Cumulative proportion to reach (default 0.5).

    Returns:
        One plus the position at which ``q`` would be inserted into the
        running total of ``prop`` taken in descending order.
    """
    group = group.sort_values(by='prop', ascending=False)
    # searchsorted is zero-based, hence the + 1 to turn it into a count.
    return group.prop.cumsum().searchsorted(q) + 1


# The diversity follow-up is left disabled, as in the original script.
# diversity = diversity.unstack('***')
# print(diversity.head())
# diversity.plot(title='number of popular names in top 50%')
# --- The "last letter" shift ---
# Take the last letter of every name.
get_last_letter = lambda x: x[-1]
# Series.map is element-wise: the function is called once per value.
last_letters = names['name'].map(get_last_letter)
last_letters.name = 'last_letter'
# Births by last letter (rows) and by '***' then year (columns).
table = names.pivot_table('births', index=last_letters, columns=['***', 'year'], aggfunc='sum')
# print(table)
# Pick three representative years
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
# print(subtable.head())
# Normalise by total births so each column holds proportions
# print(subtable.sum())
letter_prop = subtable / subtable.sum().astype(float)
# Bar charts (disabled)
# fig, axes = plt.subplots(2, 1, figsize=(10, 8))
# letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='male')
# letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='female', legend=False)
# Normalise the full table the same way
letter_prop = table / table.sum().astype(float)
# .loc replaces the removed .ix indexer, and the transpose attribute is .T.
# NOTE(review): 'M' assumes the data codes this column in upper case -- confirm.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
# print(dny_ts.head())
# Trend plot (disabled)
# dny_ts.plot()
# --- Boy names that became girl names ---
# Find the names in top1000 that contain 'lesl' (case-insensitive)
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
# print(lesley_like)
# Keep only those names and inspect their frequencies
filtered = top1000[top1000.name.isin(lesley_like)]
# print(filtered.groupby('name').births.sum())
# Aggregate by year and '***', then normalise each year (row) to sum to 1
table = filtered.pivot_table('births', index='year', columns='***', aggfunc='sum')
table = table.div(table.sum(axis=1), axis=0)
# print(table.tail())
# Yearly trend plot. The style argument was empty in this file (a syntax
# error); a per-column line style is supplied instead.
# NOTE(review): the 'M'/'F' keys assume upper-case codes in the data -- confirm.
table.plot(style={'M': 'k-', 'F': 'k--'})

資料分析資料分析概述

了解業務了解資料確認業務和資料預期分析和管理資料分析方式01.了解資料資料 1.測量標度型別屬性本源並不是數字或者符號，通過測量標度將數值或者符號和物件的屬性建立關聯。屬性的型別測量尺度 nominal 標稱等於或者不等於一對一的變換 ordinal 序數大於或者小於單調函式的變...

資料分析資料分析的誤區

在資料分析的過程中，我們難免會走一些彎路，但有些彎路是可以避免的。下面我將介紹幾個資料分析過程中常見的誤區。我們一定都聽說過二戰中的一個經典示例：軍方為了提高戰鬥機飛行員的生還率，打算在飛機上增加裝甲的厚度，但不能在所有部位加厚，這樣會喪失戰機的靈活性，於是軍方請了統計學家來研究，這些專家在一開始就...

「資料分析」崗位分析

行業內公司的融資情況從一定程度上說明了選擇資料分析崗位的穩定性佔比情況是所有職位累加在一起，未區分職位不需要融資的公司佔比60 d輪以上的公司42 其實很多不需要融資的公司，規模也是很大的，可以結合公司規模來判斷公司的情況如何，是否值得去發展。提供資料類職位的公司，規模還算比較大的 500人以上...

資料分析 1880 2020全美嬰兒姓名

資料分析 資料分析概述

資料分析 資料分析的誤區

「資料分析」崗位分析

相關推薦

資料分析資料分析概述

資料分析資料分析的誤區