import requests
from lxml import etree
import re
import pymysql
# Scrape three pages of a paginated movie-listing site (served as gb2312),
# follow each detail-page link, extract the download href and page title,
# and store each pair into the MySQL table `dianying`.
#
# NOTE(review): the original list/detail URL templates and the request
# headers dict were lost when this post was extracted -- restore the real
# values before running.
LIST_URL_TEMPLATE = ''    # TODO: restore, e.g. 'http://.../list_{}.html'
DETAIL_URL_TEMPLATE = ''  # TODO: restore, e.g. 'http://...{}'

# Request headers (at minimum a User-Agent so the site serves real pages).
headers = {
    'User-Agent': 'Mozilla/5.0',
}

# Connect to MySQL and create a cursor.
db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                     password='123456', database='python', charset='utf8')
cursor = db.cursor()

try:
    # Fetch the first three list pages; pages are 1-based, so iterate 1..3
    # directly instead of the original `for p in range(3): p += 1` hack.
    for page in range(1, 4):
        url = LIST_URL_TEMPLATE.format(page)
        print(url)
        req = requests.get(url, headers=headers)
        # The site serves gb2312; force the encoding before reading .text.
        req.encoding = 'gb2312'
        # Bug fix: lxml's parser entry point is etree.HTML, not etree.html.
        html_obj = etree.HTML(req.text)
        # Detail-page links live in the listing table.
        html_list = html_obj.xpath(
            '//div[@class="co_content8"]/ul/td/table/tr[2]/td[2]/b/a/@href')
        for href_part in html_list:
            # Build the absolute detail-page URL from the relative href.
            url_b = DETAIL_URL_TEMPLATE.format(href_part)
            req = requests.get(url_b, headers=headers)
            req.encoding = 'gb2312'
            response = req.text
            # Extract the download link and the page title.
            # NOTE(review): the original regex literals were garbled in
            # extraction; these are a best-effort reconstruction -- verify
            # them against the actual detail-page markup.
            href_match = re.search(r'href="(.*?)"', response)
            title_match = re.search(r'<title>(.*?)</title>', response)
            if href_match is None or title_match is None:
                continue  # skip detail pages that do not match the patterns
            href = href_match.group(1)
            title = title_match.group(1)
            print(title)
            # Parameterized query: the original interpolated scraped text
            # with str.format, which is vulnerable to SQL injection.
            cursor.execute(
                'insert into dianying values (null, %s, %s)', (title, href))
            db.commit()
finally:
    # Always release DB resources, even when a request or query fails.
    cursor.close()
    db.close()
Python爬取貓眼電影
不多說,直接上程式碼:`import requests`、`import re`、`import random`、`import pymysql`、`import time`;連線資料庫:`db = pymysql.connect(host='localhost', port=3306, user='root', passwd='a', db='pyt...`(摘要截斷)
Python爬取電影天堂資源
`from urllib import request, parse`;`from lxml import etree`;`import requests, re`;`url1 = ...`;`req1 = request.Request(url1)`;`response1 = request.urlopen(req1)`;`html1 = respons...`(摘要截斷)
python爬取貓眼電影排行
完整的 如下在這裡 閒著沒事,把解析html中的正則方法改用了xpath與beautifulsoup,只能說各有各的優點吧。正則的話,提取資訊可以連貫,一次性提取出所有需要的資訊,當然前提是你的正則式子沒有寫錯,所以說正則寫起來相比xpath與beautifulsoup來說要複雜一下,提取出錯後,除...