Scraping Product Information from JD.com (京東商城)

2022-08-23 14:06:11

from selenium import webdriver
from selenium.webdriver import ActionChains  # imported in the original but unused below
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pymongo
import csv
import time
from pyecharts.charts import Bar
from pyecharts import options as opts
import pandas as pd

# Connect to the MongoDB database
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.jd
datalist = []  # in-memory copy of the scraped records, used later by write_data()
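
Before running the crawler, it may help to confirm that MongoDB is actually reachable; a minimal sanity check, assuming a local mongod on the default port:

from pymongo.errors import ConnectionFailure

try:
    client.admin.command('ping')  # cheap no-op command supported by all servers
except ConnectionFailure:
    raise SystemExit('MongoDB is not reachable on localhost:27017')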

def spider_data():
    browser = webdriver.Chrome()
    url = ''  # the search URL was stripped from the original post
    browser.get(url)
    # Type the query into JD's search box and submit with Enter
    # ('8g記憶體條' means "8 GB RAM module")
    browser.find_element(By.ID, 'key').send_keys('8g記憶體條')
    browser.find_element(By.ID, 'key').send_keys(Keys.ENTER)
    # WebDriverWait(browser, 1000).until(EC.presence_of_element_located((By.CLASS_NAME, 'pn-next')))
    count = 0
    while True:
        try:
            count += 1
            # Explicit wait until the product items have loaded
            WebDriverWait(browser, 1000).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'gl-item')))
            # Scroll to the bottom so lazy-loaded items render, then back to the top
            browser.execute_script('document.documentElement.scrollTop=10000')
            time.sleep(3)
            browser.execute_script('document.documentElement.scrollTop=0')
            lists = browser.find_elements(By.CLASS_NAME, 'gl-item')
            for li in lists:
                name = li.find_element(By.XPATH, './/div[@class="p-name p-name-type-2"]//em').text
                price = li.find_element(By.XPATH, './/div[@class="p-price"]//i').text
                commit = li.find_element(By.XPATH, './/div[@class="p-commit"]//a').text
                shop_name = li.find_element(By.XPATH, './/div[@class="p-shop"]//a').text
                datas = {
                    'name': name,
                    'price': price,
                    'commit': commit,
                    'shop_name': shop_name,
                }
                # Save to MongoDB and keep an in-memory copy for the CSV export
                collection = db.datas
                collection.insert_one(datas)  # insert_one() also adds an '_id' field to datas
                datalist.append(datas)
                print(datas)
        except Exception:
            print('error')
        if count == 1:  # stop after the first results page; raise this limit to crawl more
            break
        # Otherwise click through to the next results page
        next_page = browser.find_element(By.CSS_SELECTOR, 'a.pn-next')
        next_page.click()
    print('Data crawl complete')

# Write the scraped data to a CSV file
def write_data():
    with open('e:/data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        try:
            title = datalist[0].keys()  # use the dict keys as the CSV header
            writer = csv.DictWriter(f, fieldnames=title)
            writer.writeheader()
            writer.writerows(datalist)
        except Exception:
            print('error')
    print('File write complete')
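
write_data() depends on the in-memory datalist, so it only works in the same run as spider_data(). If the records are already in MongoDB, the list can be rebuilt from the collection instead; a sketch (the helper name load_from_mongo is mine, not from the original):

def load_from_mongo():
    # Read every document back from the 'datas' collection used by spider_data()
    return list(db.datas.find())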

# Data cleaning
def clear_data():
    data = pd.read_csv('e:/data_csv.csv')
    # Drop the '_id' column that MongoDB's insert added
    data.drop('_id', axis=1, inplace=True)
    # Drop rows whose commit text is '去看二手' ("see second-hand"), which has no count
    data.drop(data[data['commit'].str.contains('去看二手')].index, inplace=True)

    def convert_data(var):
        # Strip the '+' and '萬' suffixes so the count parses as a number
        new_value = var.replace('+', '').replace('萬', '')
        return float(new_value)

    # Make the commit column numeric before filtering on it
    data['commit'] = data['commit'].apply(convert_data)
    # Drop rows whose commit count is 100 or more
    data.drop(data[data['commit'] >= 100].index, inplace=True)
    # Save the cleaned data as a CSV file
    data.to_csv('e:/clear_data.csv')
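
One caveat about convert_data(): stripping '萬' (10,000) makes a count like '1萬+' parse as 1.0, so converted values do not share a unit with plain counts like '500+'. A hedged variant that treats '萬' as a multiplier instead (convert_scaled is my own name, not from the original):

def convert_scaled(var):
    # Interpret '萬' as x10,000 so all comment counts share one unit
    if '萬' in var:
        return float(var.replace('+', '').replace('萬', '')) * 10000
    return float(var.replace('+', ''))

Swapping this in would change the behaviour of the >= 100 filter above, since every '萬'-denominated count becomes at least 10,000.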

def group_data():
    # Split the cleaned data by brand
    data1 = pd.read_csv('e:/clear_data.csv')
    # Drop the other brands, keeping only Kingston (金士頓)
    data1.drop(data1[data1['name'].str.contains('十銓|宇瞻|光威|美商海盜船|威剛|芝奇|三星|金百達|英睿達|聯想|佰微')].index, inplace=True)
    # Save as a CSV file
    data1.to_csv('e:/kingston.csv')

    data2 = pd.read_csv('e:/clear_data.csv')
    # Drop the other brands, keeping only ADATA (威剛)
    data2.drop(data2[data2['name'].str.contains('金士頓|十銓|宇瞻|光威|美商海盜船|芝奇|三星|金百達|英睿達|聯想|佰微')].index, inplace=True)
    # Save as a CSV file
    data2.to_csv('e:/weigang.csv')
    print('Data cleaning finished')

# Data visualization
def show_data():
    # Bar chart of all products. The series/title labels were censored to '**'
    # in the original post; 'price' matches the y-axis data being plotted.
    data_path = pd.read_csv('e:/clear_data.csv')
    bar = Bar()
    bar.add_xaxis(data_path['name'].tolist())
    bar.add_yaxis('price', data_path['price'].tolist())
    bar.set_global_opts(title_opts=opts.TitleOpts(title="Product prices"))
    bar.render('all_data.html')

    # Bar chart of the Kingston subset
    data_path1 = pd.read_csv('e:/kingston.csv')
    bar1 = Bar()
    bar1.add_xaxis(data_path1['name'].tolist())
    bar1.add_yaxis('price', data_path1['price'].tolist())
    bar1.set_global_opts(title_opts=opts.TitleOpts(title="Product prices"))
    bar1.render('kingston.html')

    # Bar chart of the ADATA subset
    data_path2 = pd.read_csv('e:/weigang.csv')
    bar2 = Bar()
    bar2.add_xaxis(data_path2['name'].tolist())
    bar2.add_yaxis('price', data_path2['price'].tolist())
    bar2.set_global_opts(title_opts=opts.TitleOpts(title="Product prices"))
    bar2.render('weigang.html')
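
The post never shows how these functions are wired together; a minimal driver, assuming they are meant to run in pipeline order (and that the search URL in spider_data() has been filled in):

if __name__ == '__main__':
    spider_data()  # scrape JD and fill MongoDB plus datalist
    write_data()   # dump datalist to e:/data_csv.csv
    clear_data()   # clean and save e:/clear_data.csv
    group_data()   # split by brand into kingston.csv and weigang.csv
    show_data()    # render the pyecharts bar charts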
