import json
import urllib.parse
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
class weispider(object):
    """Crawl product listings page by page with Selenium and dump them to JSON.

    Each listing page is rendered in a real Chrome instance, scrolled a few
    times so that more content has a chance to load, then parsed with
    BeautifulSoup.
    """

    # NOTE(review): the original query-string dict was lost from the source;
    # the parameter names below are an assumption -- confirm against the site.
    keyword_param = "keyword"
    page_param = "page"

    # Number of scroll steps per page and the pause (seconds) after each one.
    scroll_steps = 10
    scroll_pause = 2

    # Output key -> CSS selector (relative to one product card) for text fields.
    _TEXT_SELECTORS = (
        ("title", ".goods-title-info"),
        ("nowprice", ".goods-sells-price"),
        ("zhekou", ".good-title-pms"),
        ("oldprice", ".goods-market-price"),
        ("discount", ".goods-price-info"),
    )

    def __init__(self, url, shang, start_page, end_page):
        """Store the crawl configuration.

        Args:
            url: Base URL the encoded query string is appended to.
            shang: Product keyword to search for.
            start_page: First page number to crawl (inclusive).
            end_page: Last page number to crawl (inclusive).
        """
        self.url = url
        self.shang = shang
        self.start = start_page
        self.end = end_page

    def handle_url(self, page):
        """Return the full URL for listing page number *page*."""
        data = {
            self.keyword_param: self.shang,
            self.page_param: page,
        }
        return self.url + urllib.parse.urlencode(data)

    # Page rendering
    def handle_page(self, url):
        """Load *url* in Chrome, scroll down step by step, return the HTML.

        The driver is always shut down, even when loading or scrolling fails,
        so that crawling many pages does not leave browser processes behind.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            for _ in range(self.scroll_steps):
                # Scroll by a numeric offset; sleeping gives the page time to
                # load whatever the scroll triggers.
                driver.execute_script("window.scrollBy(0, 1000);")
                sleep(self.scroll_pause)
            return driver.page_source
        finally:
            driver.quit()

    @staticmethod
    def _first_text(node, selector):
        """Return the text of the first match of *selector*, or '' if none."""
        match = node.select_one(selector)
        return match.get_text() if match is not None else ""

    def download(self, res):
        """Parse page HTML *res* into a list of product dicts.

        Each dict has the keys ``title``, ``nowprice``, ``zhekou``,
        ``oldprice``, ``discount`` and ``img``. A field whose element is
        missing from a product card is stored as an empty string.
        """
        soup = BeautifulSoup(res, "lxml")
        items = []
        for wei in soup.select(".goods-inner"):
            item = {
                key: self._first_text(wei, selector)
                for key, selector in self._TEXT_SELECTORS
            }
            img = wei.select_one(".goods-image-link img")
            item["img"] = img.get("src", "") if img is not None else ""
            items.append(item)
        return items

    # Crawl
    def weipinspider(self, output_path="vip.json"):
        """Crawl every configured page and write all products as JSON.

        Args:
            output_path: File the JSON array is written to (UTF-8).

        Returns:
            The list of product dicts that was written.
        """
        infos = []
        for page in range(self.start, self.end + 1):
            url = self.handle_url(page)
            res = self.handle_page(url)
            infos.extend(self.download(res))
        with open(output_path, "w", encoding="utf-8") as fp:
            json.dump(infos, fp, ensure_ascii=False)
        return infos


def main():
    """Prompt for the keyword and page range, then run the spider."""
    # NOTE(review): the base URL is empty in the source this was recovered
    # from; it must be set to the site's search endpoint before running.
    url = ""
    shang = input("請輸入你要爬取的商品名")
    start_page = int(input("爬取的起始頁"))
    end_page = int(input("爬取的結束頁"))
    spider = weispider(url=url, shang=shang, start_page=start_page, end_page=end_page)
    spider.weipinspider()


if __name__ == "__main__":
    main()

# Chrome (webdriver.Chrome) is a browser driver with a visible UI; when using
# it, make sure the driver matches the Chrome browser installed on this machine.
PhantomJS是一個無介面的瀏覽器【執行較快】,但是該產品已經不再更新
爬取唯品會首頁商品分類及鏈結
很久之前做的東西。一直沒時間分享,今天有空正好分享出來。想做個爬取唯品會首頁的商品分類和鏈結的指令碼。第一反應是用beautifulsoup。但是在瀏覽器裡除錯了很久沒有發現鏈結,無奈只能放棄了使用beautifulsoup。嘗試了抓包,我們看看下面是抓到的包 我們可以很清楚的看到,兩個介面除了id...
2013 3 28 唯品會筆試題
唯品會筆試題 1.const關鍵字作用是什麼?static 關鍵字作用是什麼?2.先進先出、後進先出的典型資料結構分別是什麼?用你熟悉的語言編寫其中一個資料結構。3.任選一題 1 HTTP協議狀態碼 200 302 404 500分別代表什麼?2 tcp協議和 udp協議的區別是什麼?各舉一個典型的應用場景...
selenium 爬取拉勾
用到的知識點 用selenium爬取拉勾 from lxml import etree from selenium import webdriver from selenium.webdriver.support import expected_conditions as EC from selen...