#import pandas as pd#import numpy as np## #載入資料集#data_filename = "nba15_16_dataset/basketball.csv"#dataset = pd.read_csv(data_filename,encoding="utf-8")##清洗資料##1#dataset = pd.read_csv(data_filename,parse_dates=["date"])##2#dataset.columns = ["date", "start(et)","visitor team", "visitorpts", "home team", "homepts", "ot?", "score type","attend.", "notes"]##抽取新的特徵#dataset["homewin"] = dataset["visitorpts"] < dataset["homepts"]## dataset.head()#y_true = dataset["homewin"].values#dataset["homewin"].mean()##構造新屬性 需要預測的兩隻球隊在各自的上場比賽中勝負情況#from collections import defaultdict
#won_last = defaultdict(int)#dataset["homelastwin"] = 0#dataset["visitorlastwin"] = 0#for index, row in dataset.iterrows():
#home_team = row["home team"]#visitor_team = row["visitor team"]#row["homelastwin"] = won_last[home_team]
#dataset.set_value(index, "homelastwin", won_last[home_team])#dataset.set_value(index, "visitorlastwin", won_last[visitor_team])#won_last[home_team] = int(row["homewin"])#won_last[visitor_team] = 1 - int(row["homewin"])
##決策樹進行預測#from sklearn.tree import decisiontreeclassifier#from sklearn.cross_validation import cross_val_score#import numpy as np
#clf = decisiontreeclassifier(random_state=14)#x_previouswins = dataset[["homelastwin", "visitorlastwin"]].values
#scores = cross_val_score(clf, x_previouswins, y_true, scoring="accuracy")#print(scores)#print("accuracy: {0:.1f}%".format(np.mean(scores) * 100))##新建特徵 排名#standings_filename = "nba15_16_dataset/standings.csv"#standings = pd.read_csv(standings_filename, skiprows=0, encoding="utf-8")#standings.head()#dataset["hometeamrankshigher"] = 0#for index, row in dataset.iterrows():#home_team = row["home team"]#visitor_team = row["visitor team"]#home_rank = standings[standings["team"] == home_team]["rk"].values[0]#visitor_rank = standings[standings["team"] == visitor_team]["rk"].values[0]#dataset.set_value(index, "hometeamrankshigher",int(home_rank < visitor_rank))#x_homehigher = dataset[["hometeamrankshigher","homelastwin", "visitorlastwin",]].values
#clf = decisiontreeclassifier(random_state=14)#scores = cross_val_score(clf, x_homehigher, y_true, scoring="accuracy")#print("accuracy: {0:.1f}%".format(np.mean(scores) * 100))#dataset["hometeamrankshigher"] = 0#for index, row in dataset.iterrows():#home_team = row["home team"]#visitor_team = row["visitor team"]#home_rank = standings[standings["team"] == home_team]["rk"].values[0]#visitor_rank = standings[standings["team"] == visitor_team]["rk"].values[0]#dataset.set_value(index, "hometeamrankshigher",int(home_rank < visitor_rank))#x_homehigher = dataset[["hometeamrankshigher","homelastwin", "visitorlastwin",]].values
#clf = decisiontreeclassifier(random_state=14)#scores = cross_val_score(clf, x_homehigher, y_true, scoring="accuracy")#print("accuracy: {0:.1f}%".format(np.mean(scores) * 100))
Python資料探勘面試 位元組跳動資料探勘面試總結
1.資料探勘任務:使用者常住城市 2.資料倉儲任務:演出主藝人名標籤挖掘 3.MapReduce 原理 4.MapReduce 特別慢的原因 4.1 計算機效能:CPU、記憶體、磁碟健康、網路 4.2 I/O 操作優化:資料傾斜、map 和 reduce 數設定不合理、reduce 等待過久、小檔案過多、大量的不...
Python資料分析與挖掘實戰 資料探勘基礎
從大量資料(包括文字)中挖掘出隱含的、未知的、對決策有潛在價值的關係、模式和趨勢,並用這些知識和規則建立用於決策支援的模型,提供預測性決策支援的方法、工具和過程,就是資料探勘;它是利用各種分析工具在大量資料中尋找其規律和發現模型與資料之間關係的過程,是統計學、資料庫技術和人工智慧技術的綜合。1.定義挖...
資料探勘技術對ERP的影響
進入90年代,隨著市場競爭的進一步加劇,企業競爭空間與範圍的進一步擴大,80年代 MRP II 主要面向企業內部資源全面計畫管理的思想逐步發展為90年代怎樣有效利用和管理整體資源的管理思想,ERP(Enterprise Resource Planning,企業資源計畫)也就隨之產生。與此同時,資料庫技術和人工...