# coding=utf-8
import sys
import urllib2
import datetime
import random
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf8')
links = []            # profile URLs discovered so far (the original seed values were stripped from this listing)
nameslist = []        # Zhihu user names already written out
locationlist = []     # locations
educationlist = []    # schools
genderlist = []       # gender (1 = male, 0 = female)
agreelist = []        # upvotes received
thankslist = []       # thanks received
askslist = []         # number of questions asked
answerslist = []      # number of answers
postslist = []        # number of articles
collectionslist = []  # number of collections
logslist = []         # number of public edits
urladdress = ''  # current profile URL; the original seed URL was stripped from this listing
while True:
    try:
        html = urllib2.urlopen(urladdress)
    except:
        # if the current URL is empty or fails, retry with a random known profile link
        randt = random.randint(0, len(links) - 1)
        urladdress = links[randt]
        html = urllib2.urlopen(urladdress)
    bsObj = BeautifulSoup(html, 'html.parser')
    # NOTE: the tag names and attribute filters of the find()/find_all() calls below
    # were stripped from the original listing; they have to be filled in to match
    # the markup of the profile pages being scraped.
    users = bsObj.find_all("", )                 # links to other users on the page
    first_name = bsObj.find('title')
    n = first_name.get_text().split(' - ')[0]    # user name taken from the page title
    location = bsObj.find("", )
    l = (location.attrs['title'] if location is not None else "未知")    # "unknown"
    education = bsObj.find("", )
    e = (education.attrs['title'] if education is not None else "未知")
    gender = bsObj.find("", )
    g = (gender.attrs['value'] if gender is not None else "未知")
    agree = bsObj.find("", ).find('strong')      # upvotes received
    a = agree.get_text()
    thanks = bsObj.find("", ).find('strong')     # thanks received
    t = thanks.get_text()
    info = bsObj.find("div", ).find_all('span')  # asks / answers / posts / collections / public edits
    ask = info[1].get_text()
    ans = info[2].get_text()
    post = info[3].get_text()
    colle = info[4].get_text()
    log = info[5].get_text()
    filetxt = open('file.txt', 'a')  # append, so rows from earlier iterations are kept
    if n not in nameslist:
        nameslist.append(n)          # remember the name so each user is written only once
        filetxt.write(n + ',')
        filetxt.write(l + ',')
        filetxt.write(e + ',')
        filetxt.write(g + ',')
        filetxt.write(a + ',')
        filetxt.write(t + ',')
        filetxt.write(ask + ',')
        filetxt.write(ans + ',')
        filetxt.write(post + ',')
        filetxt.write(colle + ',')
        filetxt.write(log + '\n')
        print n, '|', l, '|', e, '|', g, '|', a, '|', t, '|', ask, '|', ans, '|', post, '|', colle, '|', log
    filetxt.close()
    # collect newly discovered profile links, skipping duplicates
    for user in users:
        if user.attrs['href'].startswith('/people/'):
            link = '' + user.attrs['href']  # the site base URL was stripped from the original listing
            if link not in links:
                links.append(link)
    # jump to a random known profile for the next iteration
    random.seed(datetime.datetime.now())
    randt = random.randint(0, len(links) - 1)
    print links[randt]
    urladdress = links[randt]
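The script above is Python 2 throughout (urllib2, the print statement, sys.setdefaultencoding). As a minimal sketch, assuming a hypothetical profile URL and a generic User-Agent header, the same fetch-and-parse step looks like this in Python 3; the selectors still have to be adapted to the real page markup, exactly as in the listing above.

# Python 3 sketch of the fetch-and-parse step; URL and User-Agent are placeholder assumptions.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

url = 'https://www.zhihu.com/people/example-user'          # hypothetical profile URL
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})  # many sites reject bare urllib requests
html = urlopen(req).read()

soup = BeautifulSoup(html, 'html.parser')
name = soup.find('title').get_text().split(' - ')[0]       # same title-splitting trick as above
print(name)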
Today I solved the problem of the same users showing up repeatedly during the crawl, and the script now pulls a full set of profile details for every user it visits. Tomorrow I'll collect this data and run some simple analysis on it, probably starting with R, mainly to get more familiar with R.
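The duplicate handling in the script relies on the "not in links" and "not in nameslist" checks, which scan a list each time. The small sketch below shows the same "seen before" test with a set, which stays constant-time as the link pool grows; seen_links and remember_link are hypothetical names, not part of the original script, and the base URL is an assumption.

# Sketch only: set-based deduplication for discovered profile links.
seen_links = set()

def remember_link(href, base='https://www.zhihu.com'):  # base URL is an assumption
    """Return the full profile URL if it is new, otherwise None."""
    url = base + href
    if url in seen_links:
        return None
    seen_links.add(url)
    return url

# usage: remember_link('/people/some-user') returns the URL the first time, None afterwards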