1 爬取資料
# coding=utf-8
import re
import sys

from pyquery import PyQuery as pq
from selenium import webdriver
# TimeoutException is raised when an explicit wait expires.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

from config import *
# Database helper module (mysqlop.py).
import mysqlop
# Browser session shared by every function below.
driver = webdriver.Chrome()
# Alternative: use the PhantomJS browser driver instead.
# driver = webdriver.PhantomJS()
# NOTE(review): the start URL is empty in this copy of the script — it looks
# like it was stripped; fill in the search page URL before running.
driver.get("")
driver.set_window_size(1400, 900)
# Shared explicit wait with a 10-second timeout.
wait = WebDriverWait(driver, 10)
def search():
    """Submit the search keyword and scrape the first result page.

    Waits (through the module-level ``wait``) until the ``#q`` input is
    present and the search button is clickable, types the keyword "美食",
    clicks the button and then calls ``get_goods()`` for the first page.
    If any wait times out, the whole search is attempted again.
    """
    try:
        # presence_of_element_located takes ONE locator tuple; passing the
        # strategy and the selector as two separate arguments is an error.
        search_box = wait.until(
            ec.presence_of_element_located((By.CSS_SELECTOR, "#q"))
        )
        # NOTE(review): identifiers in this file appear to have been
        # lowercased; "#j_tsearchform" may originally have been mixed-case.
        # CSS id selectors are case-sensitive — verify against the live page.
        submit = wait.until(
            ec.element_to_be_clickable(
                (By.CSS_SELECTOR, "#j_tsearchform > div.search-button > button")
            )
        )
        search_box.clear()
        search_box.send_keys("美食")
        submit.click()
        # Scrape the first page of results.
        get_goods()
    except TimeoutException:
        # Retry from scratch. The retry is unbounded: a page that never
        # loads will eventually exhaust the recursion limit.
        search()
# Read the total page count from the pager.
def get_total():
    """Return the text of the pager's "total" element.

    The text is returned unparsed; ``main()`` extracts the first number
    from it with a regular expression.
    """
    total = wait.until(
        ec.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")
        )
    )
    return total.text
# Pagination
def next_page(page):
    """Jump to result page *page* and scrape it.

    Types the page number into the pager's input box, clicks the submit
    button, waits until the highlighted pager item shows ``str(page)`` and
    then scrapes the page with ``get_goods()``.

    Args:
        page: page number to jump to.

    Returns:
        The item count reported by ``get_goods()`` for that page.
    """
    try:
        page_input = wait.until(
            ec.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")
            )
        )
        # NOTE(review): "span.btn.j_submit" may have been mixed-case before
        # this file was lowercased — verify against the live page.
        submit = wait.until(
            ec.element_to_be_clickable(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.j_submit")
            )
        )
        page_input.clear()
        page_input.send_keys(page)
        submit.click()
        # Confirm the pager now highlights the requested page.
        wait.until(
            ec.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                str(page),
            )
        )
        # Scrape the page that is now showing.
        return get_goods()
    except TimeoutException:
        # Return the retry's result. Previously the retry's value was
        # discarded and the function fell through to an unbound ``count``.
        return next_page(page)
def get_goods():
    """Scrape every item on the current result page and store it.

    Waits for at least one ``#mainsrp-itemlist .items .item`` element,
    parses ``driver.page_source`` with PyQuery, builds one dict per item,
    prints it and hands it to ``mysqlop.mysqlop``.

    Returns:
        The number of items processed on this page.
    """
    wait.until(
        ec.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        )
    )
    # Example item paths noted by the original author:
    #   #mainsrp-itemlist > div > div > div:nth-child(1) > div.item.j_mouseronverreq.item-ad
    #   #mainsrp-itemlist > div > div > div:nth-child(1)
    html = driver.page_source
    doc = pq(html)
    items = doc("#mainsrp-itemlist .items .item").items()
    count = 0
    for item in items:
        # NOTE(review): the original dict literal was lost from this copy of
        # the file (the line read just ``goods=``). The keys below are the
        # ones mysqlop.mysqlop() reads; the selectors are a reconstruction
        # and MUST be verified against the live page markup.
        goods = {
            "image": item.find(".pic .img").attr("src"),
            "price": item.find(".price").text(),
            "deal": item.find(".deal-cnt").text(),
            "title": item.find(".title").text(),
            "shop": item.find(".shop").text(),
            "location": item.find(".location").text(),
        }
        print(goods)
        # Insert the item into the database.
        mysqlop.mysqlop(goods)
        count += 1
    return count
def main():
    """Run the scrape: search, read the page count, then walk pages 2..N.

    Prints the total page count and, at the end, the number of items
    scraped from pages 2 onward. Items from the first page are scraped by
    ``search()`` and are not included in that printed total.

    Raises:
        ValueError: if the pager text contains no number.
    """
    search()
    total_text = get_total()
    # Extract the page count (the first run of digits) from the pager text.
    match = re.search(r"(\d+)", total_text)
    if match is None:
        raise ValueError(
            "could not find a page count in pager text: %r" % (total_text,)
        )
    total = int(match.group(1))
    print(total)
    total_count = 0
    # Page 1 was already handled by search(); continue from page 2.
    for page in range(2, total + 1):
        total_count += next_page(page)
    print(total_count)
# Run the scraper only when this file is executed as a script.
if __name__=="__main__":
    main()
2 存入到 MySQL 中
建立一個 mysqlop.py 的檔案
# coding=utf-8
from pymysql import *


def mysqlop(goods):
    """Insert one scraped item into the ``goods`` table.

    Opens a new connection, runs a single parameterized INSERT, commits and
    closes the connection. The connection is closed even when the insert or
    the commit raises.

    Args:
        goods: mapping with the keys ``image``, ``price``, ``deal``,
            ``title``, ``shop`` and ``location``.
    """
    # NOTE(review): database credentials are hard-coded here; consider
    # loading them from configuration or the environment instead.
    conn = connect(host='127.0.0.1', port=3306, user='root', passwd='1qaz2wsx#edc', db='taobao_meishi', charset='utf8')
    try:
        # The cursor context manager closes the cursor on exit.
        with conn.cursor() as cursor:
            cursor.execute(
                "insert into goods(image,price,deal,title,shop,location) values(%s,%s,%s,%s,%s,%s)",
                (
                    goods['image'],
                    goods['price'],
                    goods['deal'],
                    goods['title'],
                    goods['shop'],
                    goods['location'],
                ),
            )
        conn.commit()
    finally:
        conn.close()
使用selenium爬拉勾網資料
usr bin env python encoding utf 8 description 使用selenium爬拉勾網資料 from selenium import webdriver from selenium.webdriver.support.ui import webdriverwait ...
selenium 爬取拉勾
用到的知識點 用selenium爬取拉勾 from lxml import etree from selenium import webdriver from selenium.webdriver.support import expected conditions as ec from selen...
selenium爬取拉勾
用到的知識點 用selenium爬取拉勾 from lxml import etree from selenium import webdriver from selenium.webdriver.support import expected conditions as ec from selen...