# coding=utf-8
import sys
import urllib2
import datetime
import random
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf8')
links = []            # profile URLs discovered so far (the original seed values were stripped from this listing)
nameslist = []        # Zhihu user names already written out
locationlist = []     # locations
educationlist = []    # schools
genderlist = []       # gender (1 = male, 0 = female)
agreelist = []        # upvotes received
thankslist = []       # thanks received
askslist = []         # number of questions asked
answerslist = []      # number of answers
postslist = []        # number of articles
collectionslist = []  # number of collections
logslist = []         # number of public edits
urladdress = ''  # current profile URL; the original seed URL was stripped from this listing
while True:
    try:
        html = urllib2.urlopen(urladdress)
    except:
        # if the current URL is empty or fails, retry with a random known profile link
        randt = random.randint(0, len(links) - 1)
        urladdress = links[randt]
        html = urllib2.urlopen(urladdress)
    bsObj = BeautifulSoup(html, 'html.parser')
    # NOTE: the tag names and attribute filters of the find()/find_all() calls below
    # were stripped from the original listing; they have to be filled in to match
    # the markup of the profile pages being scraped.
    users = bsObj.find_all("", )                 # links to other users on the page
    first_name = bsObj.find('title')
    n = first_name.get_text().split(' - ')[0]    # user name taken from the page title
    location = bsObj.find("", )
    l = (location.attrs['title'] if location is not None else "未知")    # "unknown"
    education = bsObj.find("", )
    e = (education.attrs['title'] if education is not None else "未知")
    gender = bsObj.find("", )
    g = (gender.attrs['value'] if gender is not None else "未知")
    agree = bsObj.find("", ).find('strong')      # upvotes received
    a = agree.get_text()
    thanks = bsObj.find("", ).find('strong')     # thanks received
    t = thanks.get_text()
    info = bsObj.find("div", ).find_all('span')  # asks / answers / posts / collections / public edits
    ask = info[1].get_text()
    ans = info[2].get_text()
    post = info[3].get_text()
    colle = info[4].get_text()
    log = info[5].get_text()
    filetxt = open('file.txt', 'a')  # append, so rows from earlier iterations are kept
    if n not in nameslist:
        nameslist.append(n)          # remember the name so each user is written only once
        filetxt.write(n + ',')
        filetxt.write(l + ',')
        filetxt.write(e + ',')
        filetxt.write(g + ',')
        filetxt.write(a + ',')
        filetxt.write(t + ',')
        filetxt.write(ask + ',')
        filetxt.write(ans + ',')
        filetxt.write(post + ',')
        filetxt.write(colle + ',')
        filetxt.write(log + '\n')
        print n, '|', l, '|', e, '|', g, '|', a, '|', t, '|', ask, '|', ans, '|', post, '|', colle, '|', log
    filetxt.close()
    # collect newly discovered profile links, skipping duplicates
    for user in users:
        if user.attrs['href'].startswith('/people/'):
            link = '' + user.attrs['href']  # the site base URL was stripped from the original listing
            if link not in links:
                links.append(link)
    # jump to a random known profile for the next iteration
    random.seed(datetime.datetime.now())
    randt = random.randint(0, len(links) - 1)
    print links[randt]
    urladdress = links[randt]
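The script above is Python 2 throughout (urllib2, the print statement, sys.setdefaultencoding). As a minimal sketch, assuming a hypothetical profile URL and a generic User-Agent header, the same fetch-and-parse step looks like this in Python 3; the selectors still have to be adapted to the real page markup, exactly as in the listing above.

# Python 3 sketch of the fetch-and-parse step; URL and User-Agent are placeholder assumptions.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

url = 'https://www.zhihu.com/people/example-user'          # hypothetical profile URL
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})  # many sites reject bare urllib requests
html = urlopen(req).read()

soup = BeautifulSoup(html, 'html.parser')
name = soup.find('title').get_text().split(' - ')[0]       # same title-splitting trick as above
print(name)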
Today I solved the problem of the same users showing up repeatedly during the crawl, and the script now pulls a full set of profile details for every user it visits. Tomorrow I'll collect this data and run some simple analysis on it, probably starting with R, mainly to get more familiar with R.
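The duplicate handling in the script relies on the "not in links" and "not in nameslist" checks, which scan a list each time. The small sketch below shows the same "seen before" test with a set, which stays constant-time as the link pool grows; seen_links and remember_link are hypothetical names, not part of the original script, and the base URL is an assumption.

# Sketch only: set-based deduplication for discovered profile links.
seen_links = set()

def remember_link(href, base='https://www.zhihu.com'):  # base URL is an assumption
    """Return the full profile URL if it is new, otherwise None."""
    url = base + href
    if url in seen_links:
        return None
    seen_links.add(url)
    return url

# usage: remember_link('/people/some-user') returns the URL the first time, None afterwards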