1.用requests+beautifulsoup抓取糗事百科的文字內容;
2.將抓取的內容寫入txt。
1.獲取網頁原始碼
def get_html(url):
    """Download a page with requests and return its source as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the server does not answer within the timeout.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=10)
    return response.text


# 2. Inspect the structure of the page source to locate the elements to scrape.
3.找到這幾樣就可以寫抓取程式碼,如下:
# Extraction steps shown as a stand-alone excerpt; `html` is the page source
# returned by get_html().
# NOTE(review): the class names below were lower-cased in the source text and
# have been restored to camelCase (articleGender / womenIcon / manIcon);
# BeautifulSoup class matching is case-sensitive -- confirm against the page.
soup = BeautifulSoup(html, 'lxml')
datas = soup.find(id="content-left")  # container holding every post
data_list = datas.find_all(class_="article")
for data in data_list:
    contents = data.find(class_="content").text.replace('\n', '')  # post text
    name = data.find('h2').text.replace('\n', '')  # author nickname
    age_gender = data.find(class_="articleGender")  # gender/age badge, may be absent
    if age_gender is not None:
        cll = age_gender['class']
        if 'womenIcon' in cll:
            gender = '女'
        elif 'manIcon' in cll:
            gender = '男'
        else:
            gender = ''
        age = age_gender.string  # the badge text is the age
    else:
        # Anonymous posts carry no badge at all.
        gender = ''
        age = ''
    votes = data.find(class_="stats-vote").find(class_="number").text  # like count
    comments = data.find(class_="stats-comments").find(class_="number").text  # comment count

# 4. The full code follows.
import requests
from bs4 import BeautifulSoup
def get_html(url):
    """Download a page with requests and return its source as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the server does not answer within the timeout.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=10)
    return response.text
def get_data(html):
    """Yield one record for every post found in a listing page.

    Args:
        html: Page source, as returned by ``get_html``.

    Yields:
        dict: The fields extracted from one post -- ``name``, ``gender``,
        ``age``, ``contents``, ``votes`` and ``comments``.
    """
    soup = BeautifulSoup(html, 'lxml')
    datas = soup.find(id="content-left")  # container holding every post
    if datas is None:
        # No post container in this page (layout change, error page, ...):
        # yield nothing instead of failing on ``None.find_all``.
        return

    for data in datas.find_all(class_="article"):
        contents = data.find(class_="content").text.replace('\n', '')  # post text
        name = data.find('h2').text.replace('\n', '')  # author nickname

        # NOTE(review): class names were lower-cased in the source text and are
        # restored to camelCase here; BeautifulSoup matches classes
        # case-sensitively -- confirm against the live markup.
        age_gender = data.find(class_="articleGender")
        if age_gender is not None:
            cll = age_gender['class']
            if 'womenIcon' in cll:
                gender = '女'
            elif 'manIcon' in cll:
                gender = '男'
            else:
                gender = ''
            age = age_gender.string  # the badge text is the age
        else:
            # Posts without a badge have neither gender nor age.
            gender = ''
            age = ''

        votes = _stat_number(data, "stats-vote")  # like count
        comments = _stat_number(data, "stats-comments")  # comment count

        # NOTE(review): the original dict literal was lost from the source
        # text; the keys below mirror the local variable names -- confirm the
        # intended output format.
        yield {
            'name': name,
            'gender': gender,
            'age': age,
            'contents': contents,
            'votes': votes,
            'comments': comments,
        }


def _stat_number(post, css_class):
    """Return the text of the ``number`` node inside *css_class*, or ''."""
    stat = post.find(class_=css_class)
    number = stat.find(class_="number") if stat is not None else None
    return number.text if number is not None else ''
def get_txt(dict):
    """Append every record to ``糗事百科.txt``, one record per line.

    Args:
        dict: Iterable of records to write; each one is converted with
            ``str()``. The name shadows the builtin but is kept so existing
            callers are unaffected.
    """
    print('--' + '正在寫入......')
    # Append mode, so records from successive calls accumulate in one file.
    with open('糗事百科.txt', 'a+', encoding='utf-8') as f:
        for i in dict:
            f.write(str(i) + '\n')
    print('---' + '寫入完畢')
def main():
    """Crawl listing pages 1 to 19 and append every post to the text file."""
    for i in range(1, 20):
        # Apply the format: passing ``i`` as a second print argument would
        # output the literal ``%d`` followed by the number.
        print('正在爬取第%d頁' % i)
        # NOTE(review): the URL template was stripped from the source text
        # (it read ``''.format(i)``). This is the presumed text-section
        # listing URL -- confirm before running.
        url = 'https://www.qiushibaike.com/text/page/{}/'.format(i)
        html = get_html(url)
        records = get_data(html)  # generator of per-post records
        get_txt(records)
# Start the crawl only when run as a script, not when imported as a module.
if __name__ == '__main__':
    main()
多執行緒爬取糗事百科
import queue import threading from fake useragent import useragent import time import requests from requests.exceptions import requestexception from l...
python 爬取糗事百科
step 1 構建一個提取糗事百科笑話的函式 import urllib2 import urllib import re import thread import time import sys reload sys sys.setdefaultencoding utf 8 defgetpage p...
Python爬取糗事百科
一 引入模組 因為urlopen功能比較簡單,所以設定代理ip需引入ProxyHandler和build_opener模組,代理ip的獲取可以上西祠代理查詢 import re from urllib.request import Request,build_opener,ProxyHandler base...