""""""import pandas as pd
import urllib
import urllib2
from bs4 import beautifulsoup
import codecs
import re
a1 = 101500 #需要自己修改起始值
urlname_list =
url_name_start = u'/questionbank/5ymjvwgym6' #填入查詢到開始的urlname
url_name_end = u'/questionbank/g5mbgom1ax' #填入查詢到最後的urlname
a = 1
b = 1
while true:
url_name = "" + url_name_start
user_agent = "mozilla/5.0 (x11; u; linux x86_64; zh-cn; rv:1.9.2.10) gecko/2011122 ubuntu/10.10 (m**erick) firefox/2.5.1"
request = urllib2.request(url_name, headers=)
html = urllib2.urlopen(request)
html_data = beautifulsoup(html,"html.parser")
if html_data.find(name='a') is none:
urlname_list.pop()
url_name_start = urlname_list[-1]
continue
for m in html_data.find_all(href=re.compile("/questionbank/")) :
if m['href'] == url_name_end:
break
else:
a = a + 1
url_name_start = urlname_list[-1]
if url_name_end == url_name_start:
break
print u"查詢結果共" + str(a) + u"條"
print u"最終查詢結果共" + str(a) + u"條"
print u'開始爬取網頁'
# Crawl the pages
import pandas as pd
import urllib
import urllib2
from bs4 import BeautifulSoup
import codecs
import time

time_start = time.time()
# Record the starting number so each question maps to its saved page
a2 = a1
for i in urlname_list:
    try:
        url_name = "" + i  # the site's base URL goes in front of the urlname (it was stripped from the listing)
        user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/2011122 Ubuntu/10.10 (maverick) Firefox/2.5.1"
        request = urllib2.Request(url_name, headers={'User-Agent': user_agent})
        html = urllib2.urlopen(request)
        f = codecs.open('html/sz_' + str(a1), 'w')
        f.write(html.read())
        f.close()
        a1 = a1 + 1
    except:
        print i  # log the urlname that failed and keep going
        continue
print "Use this number as the starting value next time: " + str((int(a1/100)+1)*100)  # rounds up to the next hundred
print "Crawling finished; start processing the text"
# -*- coding: utf-8 -*-
def html_chuli(html):
    html_data = BeautifulSoup(html, "html.parser")
    t_miaosu = html_data.find(attrs={"name": "description"})['content']  # question description from the meta tag (attrs reconstructed)
    # The attribute filters below were stripped when the post was archived; these class names are placeholders.
    t_news_title = html_data.find_all(attrs={"class": "news-title"})
    t_news_typs = html_data.find_all(attrs={"class": "news-type"})
    t_news_time = html_data.find_all(attrs={"class": "news-time"})
    tdata1 = html_data.find("div", attrs={"class": "question-detail"})  # grab the first question container (placeholder attrs)
    if tdata1:
        t_leixing = tdata1.select('span')[0].string  # question type
        t_content = tdata1.select('div.question-title')[0].string  # question text; note: in select(), '#' targets an id and '.' targets a class name
        t_xueze = tdata1.select('div.question-item')  # every option of the question
        x_abcd = []     # option letters A/B/C/D
        x_content = []  # text of each option
        z_xueze = []    # correct option(s)
        for item in t_xueze:
            item_middle = item.get_text().split()
            # Assumed layout (the original append statements were stripped):
            # the first token is the option letter, the rest is the option text.
            x_abcd.append([item_middle[0] if item_middle else ""])
            x_content.append([" ".join(item_middle[1:])])
        for item in tdata1.select('label.actives'):  # multiple-choice questions (append reconstructed)
            z_xueze.append(item.get_text())
        for item in tdata1.select('div.question-item.correct i'):  # true/false questions (append reconstructed)
            z_xueze.append(item.get_text())
        return t_miaosu, t_leixing, t_content, x_abcd, x_content, z_xueze, t_news_title, t_news_typs, t_news_time
    else:
        return '0'
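As the note on the question-text line says, BeautifulSoup's select() takes CSS selectors: '#' matches an element id and '.' matches a class name. A minimal standalone illustration against hypothetical markup (not the target site's actual HTML):

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

demo = u'<div id="main"><div class="question-title">Sample?</div>' \
       u'<div class="question-item">A. yes</div></div>'
soup = BeautifulSoup(demo, "html.parser")
print soup.select('#main')[0]['id']                 # '#' selects by id    -> main
print soup.select('div.question-title')[0].string   # '.' selects by class -> Sample?
print len(soup.select('div.question-item'))         # all matching tags    -> 1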
# Process the text
import pandas as pd
import urllib
import urllib2
import re
import json
import random
from bs4 import BeautifulSoup
import codecs

# Adjust the file-number range of the text to extract
for i in range(a2, a1):
    try:
        with open('html/sz_' + str(i), 'r') as f:
            s_1 = ""
            s_2 = ""
            t_n = ""
            contents = f.read().decode("utf-8", "ignore")  # drop undecodable bytes
            t_miaosu, t_leixing, t_content, x_abcd, x_content, z_xueze, t_news_title, t_news_typs, t_news_time = html_chuli(contents)
            for m in range(len(x_abcd)):
                if x_abcd[m][0]:
                    s1 = x_abcd[m][0]
                else:
                    s1 = ""
                if x_content[m][0]:
                    s2 = x_content[m][0]
                else:
                    s2 = ""
                s_1 = s_1 + s1 + ":" + s2 + " "
            for n in range(len(z_xueze)):
                s_2 = s_2 + z_xueze[n].strip()
            for z in range(len(t_news_title)):
                if t_news_title[z]:
                    new1 = t_news_title[z].text
                else:
                    new1 = ""
                if t_news_typs[z]:
                    new2 = t_news_typs[z].text
                else:
                    new2 = ""
                if t_news_time[z]:
                    new3 = t_news_time[z].text
                else:
                    new3 = ""
                t_n = t_n + new1 + "|" + new2 + "|" + new3 + "&"
            if t_leixing is None:
                continue
            k1 = str(i) + "#" + t_miaosu.replace("\n", "") + "#" + t_leixing + "#" + t_content.replace(" ", "").replace("\n", "") + "#" + s_1.replace("\n", "") + "#" + s_2.replace("\n", "") + "#" + t_n.replace("\n", "")
            f1 = codecs.open(u'out/時政202011-20210325.txt', 'a', encoding="utf-8")  # change the exported txt file name here
            f1.write(k1 + "\n")
    except:
        f2 = codecs.open('out/fail_num.txt', 'a', encoding="utf-8")
        k2 = str(i)
        f2.write(k2 + "\n")
        print u"Failed to import html file " + str(i) + u"!"
        f2.close()
        continue
f1.close()
print u"Done! Before running again, change the output file name, save the .py file, and start over!!!"
This code is kept only for the record; it no longer works.
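Since the script above is Python 2 (urllib2, print statements), a present-day rewrite of the fetch-and-save step would use urllib.request from the Python 3 standard library. The following is only a minimal sketch, not the original author's code: it assumes the same html/sz_<n> file layout, that urlname_list has already been built, and BASE_URL stands in for the site address that was stripped from the listing above.

# Python 3 sketch of the fetch-and-save step (not the original code).
# Assumptions: BASE_URL is a placeholder for the stripped site address,
# urlname_list is already built, files are named html/sz_<n> as above.
import urllib.request

BASE_URL = ""        # fill in the question-bank site address if it still exists
USER_AGENT = "Mozilla/5.0"

def save_pages(urlname_list, start_num):
    num = start_num
    for urlname in urlname_list:
        req = urllib.request.Request(BASE_URL + urlname,
                                     headers={"User-Agent": USER_AGENT})
        try:
            with urllib.request.urlopen(req) as resp:
                with open("html/sz_" + str(num), "wb") as f:  # save raw bytes, decode later
                    f.write(resp.read())
            num += 1
        except Exception:
            print(urlname)   # log the failing urlname and keep going, like the original
    return num               # next file number, i.e. the new starting value a1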