需要的库:
pyquery
requests
time
re
pymysql
比较简单,所以直接上源码:
from pyquery import PyQuery as pq
import requests
import time
import re
import pymysql

# Browser-like User-Agent sent with every HTTP request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
}

# Seconds to wait for a response before giving up; without a timeout a
# stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 10

# Step used for the ``start`` query parameter and for numbering films
# (presumably the number of films per list page -- verify against the site).
PAGE_SIZE = 25

# Column order of the INSERT statement below; also the key order of the
# dict produced by ``_parse_film``.
FILM_COLUMNS = (
    'id', 'title', 'director', 'scriptwriter', 'actor', 'type', 'region',
    'language', 'date', 'duration', 'score', 'intr', 'poster',
)

INSERT_SQL = 'INSERT INTO top250(id,title,director,scriptwriter,actor,type,region,language,date,duration,score,intr,poster) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'


def get_one_page(url, n):
    """Fetch one list page and scrape every film linked from it.

    Args:
        url: URL of the list page.
        n: Zero-based page index. The film at 1-based position ``a`` on the
            page is assigned the id ``n * 25 + a``.

    Raises:
        requests.RequestException: If the list page itself cannot be
            fetched (including a timeout); this is not handled here.
    """
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    doc = pq(res.text)
    for position, item in enumerate(doc('div.hd').items(), start=1):
        fid = n * PAGE_SIZE + position
        one_url = item.children('a').attr('href')
        get_one_film(one_url, fid)
        # Pause between detail-page requests.
        time.sleep(0.8)


def get_one_film(url, fid):
    """Scrape one film detail page and insert the result into MySQL.

    Prints the id, then the scraped record, then the outcome of the insert.
    If the page cannot be fetched or parsed, a message is printed and the
    film is skipped.

    Args:
        url: URL of the film detail page.
        fid: Id to store for this film.
    """
    print(fid)
    try:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        html = res.text
        doc = pq(html)
    except Exception:
        # Deliberate best effort: any fetch/parse failure skips this film so
        # the rest of the crawl continues. ``Exception`` (rather than a bare
        # ``except``) lets Ctrl-C and SystemExit still stop the program.
        print("链接失败,跳过链接")
        return

    film = _parse_film(doc, html, fid)
    print(film)
    print(' ')
    _save_film(film)


def _parse_film(doc, html, fid):
    """Extract the film fields from a parsed detail page into a dict."""
    return {
        # Id
        'id': fid,
        # Title
        'title': doc('[property*="itemreviewed"]').text(),
        # Director
        'director': doc('[rel*="directedBy"]').text(),
        # Scriptwriter: located relative to the director link; the first
        # three characters are dropped (presumably the field label --
        # verify against the page markup).
        'scriptwriter': doc('[rel*="directedBy"]').parent().parent().next().next().text()[3:].strip(),
        # Actors: first three characters dropped, as for the scriptwriter.
        'actor': doc('#info>.actor').text()[3:].strip(),
        # Genre
        'type': doc('[property="v:genre"]').text(),
        # Region: taken from the raw HTML with a regex.
        'region': pat_findall('制片国家/地区:</span>(.*?)<br/>', html),
        # Language: taken from the raw HTML with a regex.
        'language': pat_findall('语言:</span>(.*?)<br/>', html),
        # Release date
        'date': doc('[property="v:initialReleaseDate"]').text(),
        # Running time
        'duration': doc('[property="v:runtime"]').text(),
        # Score
        'score': doc('[property="v:average"]').text(),
        # Synopsis
        'intr': doc('[property="v:summary"]').text(),
        # Poster image URL
        'poster': doc('[class="nbgnbg"]>img').attr('src'),
    }


def _save_film(film):
    """Insert one film record into the ``top250`` table, printing the outcome."""
    db = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='', db='douban')
    try:
        with db.cursor() as cursor:
            cursor.execute(INSERT_SQL, tuple(film[column] for column in FILM_COLUMNS))
        db.commit()
        # Report success only after the commit has actually gone through.
        print('写入成功')
    except pymysql.MySQLError:
        print('写入失败')
        db.rollback()
    finally:
        # Always release the connection, even if commit/rollback raised.
        db.close()


def pat_findall(reg, reg_str):
    """Return the first match of ``reg`` in ``reg_str``, or ``"unknow"``.

    The pattern is applied with ``re.S`` so ``.`` also matches newlines.
    The result has the shape ``re.findall`` produces for a single match:
    the group text for a one-group pattern, a tuple for several groups.

    Args:
        reg: Regular expression source.
        reg_str: Text to search.
    """
    matches = re.findall(reg, reg_str, re.S)
    return matches[0] if matches else "unknow"


def main():
    """Crawl the ten list pages (``start`` = 0, 25, ..., 225) in order."""
    for n in range(0, 10):
        url = "https://movie.douban.com/top250?start=" + str(n * PAGE_SIZE) + "&filter="
        get_one_page(url, n)


if __name__ == "__main__":
    main()