by:小?
post請求
cookies
session
beautifulsoup庫
scrapy框架
import requests

# Simplest possible GET request.
# NOTE(review): the target URL was lost when this post was scraped — fill in
# a real one (the surrounding examples use Baidu) before running.
response = requests.get("http://www.baidu.com/")
# Equivalent long form:
# response = requests.request("get", "http://www.baidu.com/")
import requests

# Query parameters. `params` accepts a dict (or a string); a dict is
# URL-encoded automatically — the sample output below this snippet shows
# the result ?wd=%E5%8C%97%E4%BA%AC, i.e. wd=北京.
kw = {"wd": "北京"}
# NOTE(review): the original headers dict was lost in extraction; a
# User-Agent is the usual minimum for this kind of tutorial request.
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.get("http://www.baidu.com/s?", params=kw, headers=headers)

# Response body decoded to text (unicode), using response.encoding.
print(response.text)
# Raw response body as bytes.
print(response.content)  # fixed: original had the typo `respones.content`
# Final, fully-encoded URL that was requested.
print(response.url)
# Character encoding guessed from the response headers.
print(response.encoding)
# HTTP status code.
print(response.status_code)
執行結果示例:完整 URL 以 ?wd=%E5%8C%97%E4%BA%AC(「北京」的 URL 編碼)結尾,響應編碼為 ISO-8859-1,狀態碼為 200。
import requests

# POST request: `data` takes a dict of form fields and sends it as an
# application/x-www-form-urlencoded body.
# NOTE(review): the original form fields, URL and headers were lost in
# extraction — fill in real values before running.
formdata = {"key": "value"}
url = "http://example.com/post"
headers = {"User-Agent": "Mozilla/5.0"}

response = requests.post(url, data=formdata, headers=headers)
print(response.text)
import requests

# If the proxy requires HTTP Basic Auth, embed the credentials in the
# proxy URL: "http://user:password@host:port/".
# NOTE(review): the proxy address and the target URL were lost in
# extraction — the values below are placeholders.
proxy = {"http": "http://user:password@127.0.0.1:8888/"}

response = requests.post("http://www.baidu.com/", proxies=proxy)
print(response.text)
from bs4 import BeautifulSoup  # fixed: the class is BeautifulSoup, not `beautifulsoup`

# The canonical BeautifulSoup documentation example ("The Dormouse's story");
# the string quoting was mangled in extraction and is restored here.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Pass an explicit parser so bs4 does not emit a "no parser specified" warning.
soup = BeautifulSoup(html_doc, "html.parser")
print(soup.prettify())
the dormouse's story
once upon a time there were three little sisters; and their names were
elsie
,lacie
,tillie
;and they lived at the bottom of a well.
...
# Navigating the parse tree: `soup.<tagname>` returns the FIRST matching tag.
print(soup.title)              # <title>The Dormouse's story</title>
print(soup.title.name)         # 'title'
print(soup.title.string)       # "The Dormouse's story"
print(soup.title.parent.name)  # 'head'
print("")
print(soup.p)                  # first <p> tag
print(soup.p['class'])         # tag attributes index like a dict -> ['title']
print(soup.p.string)
print("")
print(soup.a)                  # first <a> tag only
print(soup.find_all('a'))      # list of ALL <a> tags in the document
print(soup.find(id="link3"))   # the single tag whose id attribute is "link3"
the dormouse's story
title
the dormouse's story
head
the dormouse's story
['title']
the dormouse's story
elsie
[elsie, lacie, tillie]
tillie
# Print the href attribute of every link in the document.
for link in soup.find_all('a'):
    print(link.get('href'))

# All text in the document with the markup stripped.
print(soup.get_text())
the dormouse's story
the dormouse's story
once upon a time there were three little sisters; and their names were
elsie,
lacie,
tillie;
and they lived at the bottom of a well.
...
beautifulsoup爬取部落格例項
#開啟終端進入想建立的目錄輸入
scrapy startproject 專案名字
#例如
import scrapy


class dmozitem(scrapy.Item):  # fixed: must subclass scrapy.Item (scrapy.item is a module)
    """Container for one crawled entry (goes in the project's items.py)."""

    # NOTE(review): PEP 8 would name this class DmozItem; the original
    # lower-case name is kept so the spider's import keeps working.
    title = scrapy.Field()  # crawled name/title    (fixed: scrapy.Field, not scrapy.field)
    link = scrapy.Field()   # crawled link
    desc = scrapy.Field()   # crawled description
#例如
import scrapy

from tutorial.items import dmozitem  # the Item class defined in items.py


class dmozspider(scrapy.Spider):  # fixed: must subclass scrapy.Spider (scrapy.spider is a module)
    """Spider that fetches the start URLs and yields one item per <ul><li>."""

    # NOTE(review): PEP 8 would name this class DmozSpider; the original
    # lower-case name is kept for consistency with the rest of the post.
    name = "dmoz"                   # unique name used to launch the spider
    allowed_domains = ["dmoz.org"]  # requests outside these domains are dropped
    # NOTE(review): the start URLs were lost in extraction — fill in real
    # ones. Tip from the original post: the Chrome extension "XPath Helper"
    # is very handy for building the XPath expressions below.
    start_urls = [
        "",
        "",
    ]

    def parse(self, response):  # fixed: original had the fused token `defparse`
        """Extract title/link/desc from every <li> element on the page."""
        for sel in response.xpath('//ul/li'):
            item = dmozitem()
            item['title'] = sel.xpath('a/text()').extract()  # text of the <a> tag
            item['link'] = sel.xpath('a/@href').extract()    # href of the <a> tag
            item['desc'] = sel.xpath('text()').extract()     # surrounding text
            yield item  # hand the item to the pipeline
scrapy框架爬取部落格例項
python爬蟲基礎
一 什麼是爬蟲 通常爬蟲是從某個 的某個頁面開始,爬取這個頁面的內容,找到網頁中的其他鏈結位址,然後從這個位址爬到下乙個頁面,這樣一直不停的爬下去,進去批量的抓取資訊。那麼,我們可以看出網路爬蟲就是乙個不停爬取網頁抓取資訊的程式。二 爬蟲的基本流程 1,發起請求 向目標站點傳送乙個requests請...
python爬蟲基礎
爬蟲 爬蟲,全稱網路爬蟲,指按照一定的規則 模擬瀏覽器人工登入網頁的方式 自動抓取網路資訊資料的程式。簡單的說,就是將瀏覽器上網所能看到頁面上的內容通過爬蟲程式自動獲取下來,並進行儲存。爬蟲其實就是乙個程式自動收集獲取指定網路資料資訊的過程,網路資料資訊量十分龐大,人工獲取無法完成,這時就需要爬蟲來...
python 爬蟲基礎
urllib 或 requests re 01 r 大圖的 pat re.compile re 01 建立乙個正規表示式的模板 imgurls re.findall pat,data 開始匹配 print len imgurls imgurls i 0 for imgurl in imgurls i...