Scraping Douban Movies with BS4

2022-05-27 07:33:12

Scrape the Douban Top 250 movies and store them in MySQL.

#### Create the table:

```python
# connect.py
from sqlalchemy import create_engine, Column, String, Integer, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime

# hostname = 'localhost'
# port = '3306'
# username = 'root'
# password = '123456'
# database = 'douban'
db_url = 'mysql+pymysql://root:123456@localhost:3306/douban?charset=utf8'
engine = create_engine(db_url)

# create the ORM mapping (SQLAlchemy 1.x style: bind the engine to the Base)
Base = declarative_base(bind=engine)

# create a session
Session = sessionmaker(engine)
session = Session()

################## create the table
class Douban(Base):
    __tablename__ = 'douban'

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(50))
    author = Column(String(100), nullable=True)
    actor = Column(String(100))
    time = Column(String(50))
    country = Column(String(100))
    type = Column(String(100))
    createtime = Column(DateTime, default=datetime.now)

    def __repr__(self):
        # the original format string was empty; eight placeholders are needed
        return '<Douban %s %s %s %s %s %s %s %s>' % (
            self.id,
            self.name,
            self.author,
            self.actor,
            self.time,
            self.country,
            self.type,
            self.createtime,
        )

if __name__ == '__main__':
    Base.metadata.create_all()
    # user = Douban()
    # user.type = '你好'
    # user.country = '你'
    # user.author = '666'
    # user.actor = '你好啊'
    # session.add(user)
    # session.commit()
```
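Running `connect.py` directly creates the table. As a quick check that the mapping and session work, here is a minimal sketch (not part of the original post; run `connect.py` once first so the table exists):

```python
# check.py -- hypothetical helper, assumes connect.py above is on the path
from connect import Douban, session

print(session.query(Douban).count())  # number of rows inserted so far
print(session.query(Douban).first())  # first row via __repr__, or None if empty
```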

### Crawl the data and store it in the database:

```python
# douban.py
import requests
import re
from bs4 import BeautifulSoup
import time, datetime

# import pymysql
# conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='mysql', charset='utf8')
# cur = conn.cursor()
# cur.execute('use douban;')
# cur.execute("insert into douban.douban(author,actor,country) values('aa','bb','bb')")
# conn.commit()

# import the SQLAlchemy model and session
from connect import Douban, session

headers = {
    'Referer': '',  # left blank in the original post
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; ',  # truncated in the original post; substitute a full UA string
}

def get_html(x):
    num = 0
    for n in range(x + 1):
        # the URL was blank in the original post; this is the standard Top 250 list URL, paged via start=
        url = 'https://movie.douban.com/top250?start=%s' % (n * 25)
        html = requests.get(url, headers=headers).text
        soup = BeautifulSoup(html, 'lxml')
        # print(type(soup))
        content_all = soup.select('div[class="item"]')
        for m in content_all:
            num += 1
            title = m.select('span[class="title"]')[0].string
            print(title)
            content = m.select('div[class="bd"] > p[class=""]')[0]
            # stripped_strings returns an iterator of whitespace-stripped strings
            text = content.stripped_strings
            li = []
            for i in text:
                # print(i)
                li.append(str(i))
            print(li)
            # split out the director/actor list and the year/country/genre list
            author_list = li[0].split('\xa0\xa0\xa0')
            country_list = li[1].split('\xa0/\xa0')
            # print(author_list)
            # print(country_list)
            # pull the individual fields out of the lists
            author = author_list[0]
            actor = author_list[1]
            time = country_list[0]
            country = country_list[1]
            type = country_list[2]
            print(author)
            print(actor)
            print(time)
            print(country, type + '\n\n')
            print('Fetched %s in total' % num)

            # First insertion method. Take special care here: single and double
            # quotes delimit the values, and each %s must be quoted, otherwise
            # the statement errors out.
            # sql = "insert into douban(name,author,actor,time,country,type) values('%s','%s','%s','%s','%s','%s')" % (
            #     title,
            #     author,
            #     actor,
            #     time,
            #     country,
            #     type,
            # )
            # cur.execute(sql)
            # conn.commit()

            # Second insertion method: insert via SQLAlchemy
            data = Douban(
                name=title,
                author=author,
                actor=actor,
                # the string would need converting to a date type to store it as one
                time=time,
                # time=datetime.datetime.strptime(time, '%Y'),
                country=country,
                type=type,
            )
            session.add(data)
            session.commit()
```
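The two `split()` calls above hinge on the no-break spaces (`\xa0`, the parsed form of `&nbsp;`) that Douban uses as separators inside the info paragraph. A self-contained sketch against a fabricated snippet shaped like one list entry (the real markup may differ slightly):

```python
from bs4 import BeautifulSoup

# fabricated entry for illustration; '\xa0' is what '&nbsp;' parses to
html = ('<div class="item"><div class="bd"><p class="">'
        '導演: 弗蘭克·德拉邦特\xa0\xa0\xa0主演: 蒂姆·羅賓斯<br/>'
        '1994\xa0/\xa0美國\xa0/\xa0犯罪 劇情'
        '</p></div></div>')
soup = BeautifulSoup(html, 'html.parser')
content = soup.select('div[class="bd"] > p')[0]
li = [str(i) for i in content.stripped_strings]
print(li[0].split('\xa0\xa0\xa0'))  # ['導演: 弗蘭克·德拉邦特', '主演: 蒂姆·羅賓斯']
print(li[1].split('\xa0/\xa0'))     # ['1994', '美國', '犯罪 劇情']
```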

The entry point at the bottom of douban.py:

```python
if __name__ == '__main__':
    x = input('Enter a number: ')
    get_html(int(x))  # the original post breaks off after input(); calling get_html is the implied next step
```
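A final note on the commented-out pymysql route: the comment above warns that every `%s` in the hand-built SQL string must be quoted. Letting the driver bind the parameters sidesteps both the quoting problem and SQL injection; a sketch, assuming the same credentials as the commented-out block:

```python
import pymysql

# same credentials as the commented-out block above
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456',
                       db='douban', charset='utf8')
cur = conn.cursor()
sql = ('insert into douban(name, author, actor, time, country, type) '
       'values (%s, %s, %s, %s, %s, %s)')
# the driver quotes and escapes each value; no manual '%s' quoting needed
cur.execute(sql, ('肖申克的救贖', '導演: 弗蘭克·德拉邦特', '主演: 蒂姆·羅賓斯',
                  '1994', '美國', '犯罪 劇情'))
conn.commit()
```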
