python3 和 pip3 安裝
安裝 selenium
配置驅動的環境變數,或者將驅動放到已經配置好的資料夾中, 類似 window 的 cmd的目錄
Windows: C:\Windows\System32    Linux: /usr/bin 或 /usr/local/bin
安裝 pyquery
安裝pymysql
# -*- coding: utf-8 -*-
"""Created on Wed Jul 24 12:07:25 2019

@author: icheng
"""
import threading
import time

import pymysql
from pyquery import PyQuery as pq
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
class spider:
    """Headless-Firefox scraper that collects course enrollment counts.

    NOTE(review): the scraped source lost all bracket literals, letter case
    and the list-append lines; those parts are reconstructed below — confirm
    against the original repository if available.
    """

    # Deliberately NOT __init__: instances are created cheaply in run() and
    # the browser is only started when getbatchdata() calls self.__init().
    def __init(self):
        self.__data = []  # accumulated (count, id) pairs for the SQL update
        self.__options = Options()
        self.__options.add_argument('-headless')
        self.__driver = Firefox(options=self.__options)
        print('init')

    # Quit the headless browser.
    def __close(self):
        self.__driver.quit()
        print("close")

    # Load the page through the browser so JavaScript is executed.
    def pageparsing(self, url):
        """Return the rendered HTML of *url*.

        On ConnectionError, retries with an exponentially growing delay,
        giving up once the delay exceeds 50 seconds (returns '' then).
        """
        html = ''  # ensure a defined return value even if every attempt fails
        try:
            self.__driver.get(url)
            time.sleep(1)  # give page scripts a moment to render
            html = self.__driver.page_source
        except ConnectionError:
            print('connectionerror')
            print('attempting to reconnect')
            # Original started at 0, but 0 * 2 stays 0: the back-off never
            # grew and the loop never hit the > 50 exit. Start at 1 instead.
            time_sleep = 1
            while True:
                time.sleep(time_sleep)
                try:
                    self.__driver.get(url)
                    html = self.__driver.page_source
                    break
                except ConnectionError:
                    time_sleep = time_sleep * 2
                    if time_sleep > 50:
                        print('pageparsing wrong exit')
                        break
        return html

    # Scrape a single course page.
    def getinfo(self, url):
        """Return the enrollment count parsed from *url*, or -1 on failure.

        Retries up to 5 times when the enrollment element is empty (page
        not fully rendered yet).
        """
        html = self.pageparsing(url)
        doc = pq(html)
        text = doc('.course-enroll-info_course-enroll_price-enroll_enroll-count').text()
        trynumber = 0
        while text == '' and trynumber < 5:
            html = self.pageparsing(url)
            doc = pq(html)
            text = doc('.course-enroll-info_course-enroll_price-enroll_enroll-count').text()
            trynumber += 1
        if trynumber == 5:
            print('geting', url.split('/')[-1])
            return -1
        number = int(text[2:-3])  # extract digits from '已有###人參加'
        return number

    def getdata(self):
        """Return the accumulated (count, id) result list."""
        return self.__data

    # Batch entry point, used as a thread target in run().
    def getbatchdata(self, urls):
        """Scrape every (id, url) row in *urls* and store results in __data."""
        self.__init()
        for row in urls:
            # Rows come from 'select id,url from c', so row[1] is the URL —
            # the scraped original passed row[0] (the id) to getinfo().
            number = self.getinfo(row[1])
            if number == -1:
                continue
            # Order matches 'update c set number=%s where id=%s'.
            self.__data.append((number, row[0]))
        self.__close()
# Main driver: split the work across several threads to speed up scraping.
def run(urls):
    """Distribute *urls* over worker threads and return the merged results.

    Each worker gets its own spider (and browser). Returns the concatenated
    (count, id) lists from every worker.

    NOTE(review): the scraped original lost the list literals and append
    calls, and rebound the name 'spider' over the class (which would crash
    on the second loop iteration); both are repaired here.
    """
    threadnumber = 10
    num = len(urls)
    count = num // threadnumber  # rows per worker (integer division)
    threads = []  # worker threads
    spiders = []  # one spider per thread, kept to collect results later
    for i in range(threadnumber):
        s = spider()
        spiders.append(s)
        t = threading.Thread(target=s.getbatchdata,
                             args=(urls[count * i:count * (i + 1)],))
        threads.append(t)
        time.sleep(1)  # stagger browser start-up
    # Leftover rows that integer division did not assign to any worker.
    if count * threadnumber < num:
        s = spider()
        spiders.append(s)
        t = threading.Thread(target=s.getbatchdata,
                             args=(urls[count * threadnumber:num],))
        threads.append(t)
    for t in threads:  # start all workers
        t.start()
    for t in threads:  # block until every worker has finished
        t.join()
    data = []
    for s in spiders:
        data += s.getdata()
    return data
def update():
    """Fetch (id, url) rows from MySQL, scrape counts, write them back.

    Prints total elapsed time. Change the connection settings and the two
    SQL statements to match the target machine/schema.
    """
    times = time.time()
    # Connect to the database — adjust credentials for the executing machine.
    db = pymysql.connect(host='localhost', user='root',
                         password='123456', database='world')
    cursor = db.cursor()
    # Only id and url are needed; adjust table/column names as required.
    select = 'select id,url from c'
    cursor.execute(select)
    urls = []  # (id, url) rows — list literal lost in the scraped original
    for row in cursor:
        urls.append(row)
    print('running...')  # scrape
    # run() returns (number, id) pairs ready for executemany below.
    # (The scraped original re-assigned 'data = ' right after run(), which
    # would have discarded the results; that stray line is removed.)
    data = run(urls)
    print('data acquisition success')
    sql = 'update c set number=%s where id=%s'
    cursor.executemany(sql, data)  # bulk update
    db.commit()
    cursor.close()
    db.close()
    print('successful database update')
    times2 = time.time()
    print(times2 - times)
# Script entry point: run one full scrape-and-update cycle when executed directly.
if __name__ == '__main__':
    update()
python 網路爬蟲 與資料庫
這是一個簡單的爬取豆瓣電影top250的 爬取了每一條電影的18個維度的資料,並且將他們儲存在本地的mysql資料庫中.詳細如下. requests 請求網頁,獲取網頁資料 lxml 使用xpath語法快速解析網頁資料 coding utf 8 created on tue jan 22 20 55...
python 爬蟲 xpath 儲存到資料庫
參考 安裝 lxml 庫 import pymysql import requests from lxml import etree def get movies page url page 獲取url中的內容 response requests.get url html content respo...
Python爬蟲之四倉庫(資料庫)
第三方庫名 sqlite3import sqlite3 建立資料庫連線物件 conn sqlite3.connect my data.db 建立資料庫操控物件 control conn.cursor 查詢 返回可迭代物件 info control.execute select from novel ...