蓋得化工最新採集2

需要切換 IP，採集過多會被封鎖

# Holds all second-level ** (word censored in SOURCE; presumably URLs -- TODO confirm)

filename='combinedfile.csv'

# Holds the second-level ** directory (word censored in SOURCE).
# NOTE(review): site_hubei is empty here; it is passed to get_sites() as the
# URL prefix, so it presumably must be filled in before running -- confirm.

site_hubei=""

# Number of listing pages handed to get_sites().
pages_hubei=31

def get_sites(site, pages):
    """Build the list of paginated listing URLs for one category.

    Args:
        site: URL prefix of the category, without the page suffix.
        pages: Number of pages; pages are numbered 1 through ``pages``.

    Returns:
        A list with one URL per page, each of the form ``<site>-p<N>.html``.
        Empty when ``pages`` is less than 1.
    """
    # The original left the list uninitialised and never stored the URL it
    # computed on each iteration; build and return the list directly.
    return [site + "-" + "p" + str(page) + ".html" for page in range(1, pages + 1)]

def get_company_name(elems, i):
    """Return the company name of the ``i``-th listing entry.

    Args:
        elems: Sequence of parsed entries; each must offer ``select(css)``
            returning a list of nodes that expose a ``text`` attribute.
        i: Index of the entry to read.

    Returns:
        The text of the first ``.dblue`` node inside the entry, or ``""``
        when the entry has no such node.
    """
    matches = elems[i].select(".dblue")
    if not matches:
        # Element not found: leave the field empty.
        return ""
    return matches[0].text

def get_main_product(elems, i):
    """Return the main-product text of the ``i``-th listing entry.

    Args:
        elems: Sequence of parsed entries; each must offer ``select(css)``
            returning a list of nodes that expose a ``text`` attribute.
        i: Index of the entry to read.

    Returns:
        The text of the second ``li`` node with leading/trailing ``\\r`` and
        ``\\n`` characters removed, or ``""`` when the entry has fewer than
        two ``li`` nodes.
    """
    items = elems[i].select("li")
    # The value is read from index 1, so a single <li> is as unusable as none;
    # the original only guarded the empty case and raised IndexError here.
    if len(items) < 2:
        return ""
    return items[1].text.strip("\r\n")

def get_phone_address(elems, i):
    """Extract the phone and address strings of the ``i``-th listing entry.

    The contact text of the first ``.site_l`` node is trimmed and split on a
    blank line (``"\\r\\n\\r\\n"``). Two parts are taken as phone then address.
    A single part is classified by the marker it contains.

    Args:
        elems: Sequence of parsed entries; each must offer ``select(css)``
            returning a list of nodes that expose a ``text`` attribute.
        i: Index of the entry to read.

    Returns:
        A ``(phone, address)`` tuple of strings. A field that is missing or
        cannot be recognised is returned as ``""``.
    """
    contacts = elems[i].select(".site_l")
    if not contacts:
        # No contact node at all: report both fields as empty.
        return ("", "")

    # str.strip treats its argument as a set of characters, so the original
    # two-step strip is equivalent to removing \r, \n and \t from both ends.
    text = contacts[0].text.strip("\r\n\t")
    parts = text.split("\r\n\r\n")

    # Defaults cover every layout the branches below do not recognise; the
    # original left both names unbound in those cases.
    phone = ""
    address = ""
    if len(parts) == 2:
        phone, address = parts
    elif len(parts) == 1:
        content = parts[0]
        # NOTE(review): "**" looks like a censored placeholder for the phone
        # label in SOURCE; kept byte-identical -- confirm the real marker.
        if "**" in content:
            # Checked first so it wins when both markers are present, as the
            # original's second ``if`` overwrote the first.
            phone = content
        elif "位址" in content:
            address = content
    return (phone, address)

# Collect the company records of one listing page (the original note says
# 20 companies per page) into a list of rows.

def get_page_information(elems):
    """Turn the parsed entries of one page into CSV-ready rows.

    Args:
        elems: Sequence of parsed listing entries, as accepted by
            ``get_company_name``, ``get_main_product`` and
            ``get_phone_address``.

    Returns:
        A list of rows, one per entry that could be read. Entries whose
        extraction raises are reported on stdout and skipped.
    """
    list_rows_information = []
    for i in range(len(elems)):
        try:
            # Company name
            company_name = get_company_name(elems, i)
            # Main product
            main_product = get_main_product(elems, i)
            # Contact details (heading censored in SOURCE)
            phone, address = get_phone_address(elems, i)
        except Exception:
            # Best effort: one malformed entry must not lose the whole page.
            print("error at:", i)
            continue
        # NOTE(review): SOURCE never stored the extracted fields, so the
        # function always returned nothing useful. The column order below is
        # an assumption -- confirm against the expected CSV layout.
        list_rows_information.append([company_name, main_product, phone, address])
    return list_rows_information

# Write one page of content to a CSV file; list_tablecontent is a
# two-dimensional list such as [[a], [b], [c]].

def write_table_to_csv(filename, list_tablecontent):
    """Write ``list_tablecontent`` to ``filename`` as CSV, replacing the file.

    Args:
        filename: Path of the CSV file to create or overwrite.
        list_tablecontent: Iterable of rows, each row an iterable of cells.
    """
    # newline='' lets the csv module control line endings itself.
    # The context manager closes the file even if writerows() raises, which
    # the original open()/close() pair did not.
    # NOTE(review): no encoding is given, so the platform default is used,
    # as in the original -- confirm this suits the scraped Chinese text.
    with open(filename, 'w', newline='') as file:
        csv.writer(file).writerows(list_tablecontent)

# Write all files: one CSV per listing page.

def write_alltables_to_csvs(list_pages):
    """Download every listing page and save its records as ``<n>.csv``.

    Pages are numbered from 1 in the order given. A page that fails at any
    step is reported on stdout and skipped.

    Args:
        list_pages: Sequence of listing-page URLs.
    """
    # Iterate over the argument itself rather than the global page count, so
    # the function works for any list it is handed.
    for i, url in enumerate(list_pages):
        try:
            # A timeout keeps one stalled connection from hanging the run.
            res = requests.get(url, timeout=30)
            # The class is bs4.BeautifulSoup; the lowercase spelling in the
            # original raised AttributeError on every page, which the bare
            # except then hid.
            soup = bs4.BeautifulSoup(res.text, "lxml")
            # Combined information of each company entry
            elems = soup.select(".clist_list_content_r")
            # Collect the page's company records into rows
            list_rows_information = get_page_information(elems)
            filename = str(i + 1) + ".csv"
            write_table_to_csv(filename, list_rows_information)
            # Random pause between pages, as in the original.
            time.sleep(random.randint(10, 15))
        except Exception:
            print("error at:", i)
            continue

# Main routine

# Build the listing-page URLs.
# NOTE(review): the original note says 32 pages, but pages_hubei is 31 -- confirm.

list_pages=get_sites(site_hubei,pages_hubei)

# Generate all CSV files

write_alltables_to_csvs(list_pages)

'''測試

i=3

res=requests.get(list_pages[i])

soup=bs4.beautifulsoup(res.text,"lxml")

elems=soup.select(".clist_list_content_r")

#****

elems_contact=elems[2].select(".site_l")

content_contact=elems_contact[0].text

content_contact1=content_contact.strip("\r\n\r\n\t\r\n")

content_contact2=content_contact1.strip("\r\n")

list_content_contact=content_contact2.split("\r\n\r\n")

#有時候資訊會缺失，用正規表示式篩選text內容

if len(list_content_contact)==2:

phone=list_content_contact[0]

address=list_content_contact[1]

if len(list_content_contact)==1:

content=list_content_contact[0]

if "位址" in content:

address=content

phone=""

if "**" in content:

phone=content

address=""

'''

蓋得化工網翻頁測試頁碼框輸入頁碼

關鍵語句 browser webdriver.firefox browser.maximize window 瀏覽器全屏顯示 browser.get list pages hubei 0 page elem browser.find element by name pagenum browser.f...

蓋德化工採集新方案

coding utf 8 created on sun may 15 20 41 32 2016 author daxiong import requests,bs4,csv,time,random,os 存放所有二級 filename combinedfile.csv 存放二級目錄 site h...

Python採集例項2

也許這裡會顯得多此一舉，但是基於我對解耦的強烈願望，我還是果斷地寫到檔案裡了。後面如果採用物件導向程式設計，重構起來是十分方便的。獲取網頁內容部分也是相對簡單的，但是需要把網頁的內容都儲存到一個資料夾裡。這裡有幾個新的用法複製如下 os.getcwd 獲得當前資料夾路徑 os.path.sep ...

蓋得化工最新採集2

蓋得化工網 翻頁測試 頁碼框輸入頁碼

蓋德化工採集新方案

Python採集例項2

相關推薦

蓋得化工網翻頁測試頁碼框輸入頁碼