from selenium import webdriver
from selenium.webdriver import ActionChains  # get attributes (imported in the original, unused below)
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pymongo
import csv
import time
from pyecharts.charts import Bar
from pyecharts import options as opts
import pandas as pd

# connect to the MongoDB database
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.jd
datalist = []  # collects the scraped records for the CSV export
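# Added sanity check (not in the original post): 'ping' is a standard MongoDB
# admin command; it fails fast with ServerSelectionTimeoutError if no local
# mongod is reachable, instead of erroring at the first insert.
client.admin.command('ping')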
def spider_data():
    browser = webdriver.Chrome()
    url = ''  # target URL left blank in the original post
    browser.get(url)
    browser.find_element_by_id('key').send_keys('8g記憶體條')
    browser.find_element_by_id('key').send_keys(Keys.ENTER)
    # WebDriverWait(browser, 1000).until(EC.presence_of_element_located((By.CLASS_NAME, 'pn-next')))
    count = 0
    while True:
        try:
            count += 1
            # explicit wait until the product items have loaded
            WebDriverWait(browser, 1000).until(EC.presence_of_element_located((By.CLASS_NAME, 'gl-item')))
            # scroll to the bottom so lazy-loaded items render, then back to the top
            browser.execute_script('document.documentElement.scrollTop=10000')
            time.sleep(3)
            browser.execute_script('document.documentElement.scrollTop=0')
            lists = browser.find_elements_by_class_name('gl-item')
            for li in lists:
                name = li.find_element_by_xpath('.//div[@class="p-name p-name-type-2"]//em').text
                price = li.find_element_by_xpath('.//div[@class="p-price"]//i').text
                commit = li.find_element_by_xpath('.//div[@class="p-commit"]//a').text
                shop_name = li.find_element_by_xpath('.//div[@class="p-shop"]//a').text
                datas = {}
                datas['name'] = name
                datas['price'] = price
                datas['commit'] = commit
                datas['shop_name'] = shop_name
                # store the record in MongoDB; insert_one also adds an _id
                # field to datas, which is why a _id column is dropped later
                collection = db.datas
                collection.insert_one(datas)
                datalist.append(datas)  # keep a copy for the CSV export
                print(datas)
        except Exception as e:
            print('error', e)
        if count == 1:  # stop after the first page; raise this limit to crawl more pages
            break
        next_page = browser.find_element_by_css_selector('a.pn-next')
        next_page.click()
    print("scraping finished")
# export the scraped data to CSV
def write_data():
    with open('e:/data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        try:
            title = datalist[0].keys()
            writer = csv.DictWriter(f, title)
            writer.writeheader()
            writer.writerows(datalist)
        except Exception as e:
            print('error', e)
    print('CSV export finished')
# data cleaning
def clear_data():
    data = pd.read_csv('e:/data_csv.csv')
    # drop the MongoDB _id column
    data.drop('_id', axis=1, inplace=True)
    # drop rows whose comment field contains '去看二手' (second-hand listings)
    data.drop(data[data['commit'].str.contains('去看二手')].index, inplace=True)

    def convert_data(var):
        # strip the '+' suffix and the '萬' (10,000) unit, keep the number
        new_value = var.replace('+', '').replace('萬', '')
        return float(new_value)

    # convert the comment counts to numbers (the original defined convert_data
    # but never applied it, so the numeric comparison below would fail on strings)
    data['commit'] = data['commit'].apply(convert_data)
    # drop rows with a commit count of 100 or more
    data.drop(data[data['commit'] >= 100].index, inplace=True)
    # save as a CSV file
    data.to_csv('e:/clear_data.csv')
def group_data():
    # keep only Kingston (金士頓): drop the other brands
    data1 = pd.read_csv('e:/clear_data.csv')
    data1.drop(data1[data1['name'].str.contains('十銓|宇瞻|光威|美商海盜船|威剛|芝奇|三星|金百達|英睿達|聯想|佰微')].index, inplace=True)
    # save as a CSV file
    data1.to_csv('e:/kingston.csv')
    # keep only ADATA (威剛): drop the other brands
    data2 = pd.read_csv('e:/clear_data.csv')
    data2.drop(data2[data2['name'].str.contains('金士頓|十銓|宇瞻|光威|美商海盜船|芝奇|三星|金百達|英睿達|聯想|佰微')].index, inplace=True)
    # save as a CSV file
    data2.to_csv('e:/weigang.csv')
    print('data cleaning finished')
# data visualization
def show_data():
    data_path = pd.read_csv('e:/clear_data.csv')
    bar = Bar()
    bar.add_xaxis(data_path['name'].tolist())
    bar.add_yaxis('**', data_path['price'].tolist())
    bar.set_global_opts(title_opts=opts.TitleOpts(title="商品**"))
    bar.render('all_data.html')

    data_path1 = pd.read_csv('e:/kingston.csv')
    bar1 = Bar()
    bar1.add_xaxis(data_path1['name'].tolist())
    bar1.add_yaxis('**', data_path1['price'].tolist())
    bar1.set_global_opts(title_opts=opts.TitleOpts(title="商品**"))
    bar1.render('kingston.html')

    data_path2 = pd.read_csv('e:/weigang.csv')
    bar2 = Bar()
    bar2.add_xaxis(data_path2['name'].tolist())
    bar2.add_yaxis('**', data_path2['price'].tolist())
    bar2.set_global_opts(title_opts=opts.TitleOpts(title="商品**"))
    bar2.render('weigang.html')  # the original text breaks off before this render call
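# Added driver (the original post never shows how the functions are invoked;
# the implied order is scrape -> export -> clean -> group -> plot):
if __name__ == '__main__':
    spider_data()
    write_data()
    clear_data()
    group_data()
    show_data()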