zoukankan      html  css  js  c++  java
  • python爬虫获取豆瓣网前250部电影的详细信息

    网址 https://movie.douban.com/top250

    一共250部电影,有分页,获取每一部的详细信息

    不采用框架,使用 urllib 读取网页,re 进行正则表达式匹配,lxml 进行 xpath 查找

     1 from film import *
     2 from urllib import request
     3 import time,re
     # Listing endpoint; the offset of the first entry on the page is appended.
     BASE_URL = r'https://movie.douban.com/top250?start='
     PAGE_SIZE = 25    # entries per listing page
     PAGE_COUNT = 10   # 10 pages * 25 entries = 250 movies

     # Request headers sent with every listing request.
     HEADERS = {
         'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                       r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
         'Connection': 'keep-alive'
     }

     # Matches the rank (<em class="">N</em>) immediately followed by the link
     # to the movie's subject page. Group 1 is the rank, group 2 the URL.
     ENTRY_PATTERN = re.compile(
         r'<em\s+class="">(\d+)</em>\s*'
         r'<a\s+href="(https://movie\.douban\.com/subject/\d+/)">'
     )


     def build_page_url(page_index):
         """Return the listing URL for the zero-based page *page_index*.

         The URL is always derived from BASE_URL, so offsets never accumulate
         from one page to the next.
         """
         return BASE_URL + str(page_index * PAGE_SIZE)


     def extract_entries(page):
         """Return a list of (rank, subject_url) string pairs found in *page*.

         An empty list is returned when the markup contains no entries.
         """
         return ENTRY_PATTERN.findall(page)


     def fetch_page(url):
         """Download *url* with HEADERS and return the body decoded as UTF-8."""
         req = request.Request(url, headers=HEADERS)
         # The context manager closes the HTTP response once it has been read.
         with request.urlopen(req) as response:
             return response.read().decode('utf-8')


     def main():
         """Walk every listing page, then fetch and print each movie's details."""
         for i in range(PAGE_COUNT):
             url = build_page_url(i)
             print(url)
             for no, rurl in extract_entries(fetch_page(url)):
                 filma = film(no, rurl, '', '', '', '', '', '')
                 filma.getall()
                 filma.detail()
                 # Pause between detail requests to keep the request rate low.
                 time.sleep(3)
             # Pause between listing pages as well.
             time.sleep(3)


     if __name__ == '__main__':
         main()

    film.py 如果要做数据的持久化,在这里实现

     1 from urllib import request
     2 from lxml import etree
     class film:
         """One movie entry plus the details scraped from its subject page.

         The constructor stores its arguments unchanged. After getall() runs,
         name, year, score, director, classification and actor hold the
         results of the XPath queries (lxml returns a list for each query).
         """

         # Request headers sent when fetching the subject page.
         _HEADERS = {
             'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                           r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
             'Connection': 'keep-alive'
         }

         def __init__(self, no, url, name, year, score, director, classification, actor):
             """Store the rank (*no*), the subject page *url* and the detail fields."""
             self.name = name
             self.year = year
             self.score = score
             self.director = director
             self.classification = classification
             self.actor = actor
             self.url = url
             self.no = no

         def detail(self):
             """Print every field of the entry on a single line."""
             temp = (
                 f"No:{self.no};url:{self.url};片名:{self.name};年份:{self.year};"
                 f"分数:{self.score};导演:{self.director};"
                 f"分级:{self.classification};演员:{self.actor};"
             )
             print(temp)

         def getall(self):
             """Fetch self.url and fill the detail fields from the page markup.

             Raises whatever urllib raises on a network or HTTP error, and
             UnicodeDecodeError if the body is not valid UTF-8.
             """
             req = request.Request(self.url, headers=self._HEADERS)
             # The context manager closes the HTTP response once it has been read.
             with request.urlopen(req) as response:
                 page = response.read().decode('utf-8')
             selector = etree.HTML(page)
             # NOTE(review): the queries below are positional and depend on the
             # page layout; confirm each still selects the intended element,
             # in particular span[5] for the classification.
             self.name = selector.xpath('/html/body/div[3]/div[1]/h1/span[1]/text()')
             self.year = selector.xpath('//*[@id="content"]/h1/span[2]/text()')
             self.score = selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
             self.director = selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
             self.classification = selector.xpath('//*[@id="info"]/span[5]/text()')
             self.actor = selector.xpath('//*[@id="info"]/span[3]/span[2]/a/text()')
    34         
    35         
  • 相关阅读:
    uwsgi
    Angular.js中处理页面闪烁的方法详解
    Mongo db change datadir
    day 007作业
    day 007总结
    day 006作业
    day006 总结
    day 005作业
    day 005总结
    day 004作业
  • 原文地址:https://www.cnblogs.com/newvoyage/p/7043682.html
Copyright © 2011-2022 走看看