zoukankan      html  css  js  c++  java
  • Python动态网站的抓取

    网页下载器

    # coding:utf-8
    import requests
    import urllib2
    import sys
    type = sys.getfilesystemencoding()
    class HtmlDownloader(object):

    def download(slef, url):

    if url is None:
    return None

    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    if response.getcode() == 200:
    html = response.read()
    return html


    return None

    网页解析器

    # coding:utf-8
    import re
    import json
    class HtmlParser(object):

    def parser_url(self, page_url, response):

    pattern = re.compile(r'(http://movie.mtime.com/(d+)/)')
    urls = pattern.findall(response)
    if urls != None:
    # 将urls进行去重
    return list(set(urls))
    else:
    return None

    # 解析异步响应值
    def parser_json(self, page_url, response):

    # 将"="和";"之间的内容提取出来
    pattern = re.compile(r'=(.*?);')
    result = pattern.findall(response)[0]

    if result != None:
    value = json.loads(result)
    try:
    isRelease = value.get('value').get('isRelease')
    except Exception, e:
    print e
    return None
    if isRelease:
    if value.get('value').get('releaseType') == None:
    return self._parser_release(page_url, value)
    else:
    return self._parser_no_release(page_url, value, isRelease=2)
    else:

    return self._parser_no_release(page_url, value)

    def _parser_release(self, page_url, value):

    try:
    isRelease = 1
    movieRating = value.get('value').get('movieRating')
    boxOffice = value.get('value').get('boxOffice')
    moveTitle = value.get('value').get('moveTitle')
    RPictureFinal = movieRating.get('RPictureFinal')
    RStoryFinal = movieRating.get('RStoryFinal')
    RDirectorFinal = movieRating.get('RDirectorFinal')
    ROtherFinal = movieRating.get('ROtherFinal')
    RathingFinal = movieRating.get('RarhingFinal')

    MovieId = movieRating.get('MoviedId')
    Usercount = movieRating.get('Usercount')
    AttitudeCount = movieRating.get('AttitudeCount')

    TotalBoxOffice = boxOffice.get('TotalBoxOffice')
    TotalBoxOfficeUnit = boxOffice.get('TotalBoxOfficeUnit')
    TodayBoxOffice = boxOffice.get('TodayBoxOffice')
    TodayBoxOfficeUnit = boxOffice.get('TodayBoxOfficeUnit')

    ShowDays = boxOffice.get('ShowDays')

    try:

    Rank = boxOffice.get('Rank')
    except Exception, e:
    Rank = 0

    return (
    MovieId, moveTitle, RathingFinal, ROtherFinal, RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
    AttitudeCount
    , TotalBoxOffice + TotalBoxOfficeUnit, TodayBoxOffice + TodayBoxOfficeUnit, Rank, ShowDays, isRelease)
    except Exception, e:
    print e, page_url, value

    return None

    # 解析未上映的电影信息
    def _parser_no_release(self, page_url, value, isRelease=0):

    try:
    movieRating = value.get('value').get('movieRating')
    moveTitle = value.get('value').get('movieTitle')
    RPictureFinal = movieRating.get('RPictureFinal')
    RStoryFinal = movieRating.get('RStoryFinal')
    RDirectorFinal = movieRating.get('RDirectorFinal')
    ROtherFinal = movieRating.get('ROtherFinal')
    RatingFinal = movieRating.get('RatingFinal')

    MovieId = movieRating.get('MovieId')
    Usercount = movieRating.get('Usercount')
    AttitudeCount = movieRating.get('AttitudeCount')

    try:

    Rank = 0

    except Exception, e:
    Rank =0
    return (
    MovieId, moveTitle, RatingFinal, ROtherFinal, RPictureFinal, RDirectorFinal, RStoryFinal,
    Usercount,
    AttitudeCount
    , u'无', u'无', Rank, 0, isRelease)

    except Exception, e:

    print e, page_url, value

    return None

     数据存储器

    # coding:utf-8
    import MySQLdb


    class DataOutput(object):

    def __init__(self):
    self.con =MySQLdb.connect(host='127.0.0.1', user='root', passwd='', db='go',port=3306,charset='utf8')
    self.cx = self.con.cursor()
    self.create_table('MTime')
    self.datas = []

    def create_table(self, table_name):

    values = "id int(11) not null primary key auto_increment,"
    "MovieId int(11),"
    "MovieTitle varchar(40) NOT NULL,"
    "RatingFinal double NOT NULL DEFAULT 0.0,"
    "ROtherFinal double NOT NULL DEFAULT 0.0,"
    "RPictureFinal double NOT NULL DEFAULT 0.0,"
    "RDirectorFinal double NOT NULL DEFAULT 0.0,"
    "RStoryFinal double NOT NULL DEFAULT 0.0,"
    "Usercount int(11) NOT NULL DEFAULT 0,"
    "AttitudeCount int(11) NOT NULL DEFAULT 0,"
    "TotalBoxOffice varchar(20) NOT NULL,"
    "TodayBoxOffice varchar(20) NOT NULL,"
    "Rank int(11) NOT NULL DEFAULT 0,"
    "ShowDays int(11) NOT NULL DEFAULT 0,"
    "isRelease int(11) NOT NULL"
    ""
    #print 'CREATE TABLE IF NOT EXISTS %s(%s)' % (table_name, values)

    self.cx.execute('CREATE TABLE IF NOT EXISTS %s(%s) ENGINE=InnoDB DEFAULT CHARSET=utf8' % (table_name, values))

    def store_data(self, data):

    if data is None:
    return
    self.datas.append(data)
    if len(self.datas) > 10:
    self.output_db('MTime')

    def output_db(self, table_name):
    for data in self.datas:
    self.cx.execute("INSERT INTO MTime (MovieId,MovieTitle,RatingFinal,ROtherFinal,RPictureFinal,RDirectorFinal,"
    "RStoryFinal,Usercount,AttitudeCount,TotalBoxOffice,TodayBoxOffice,Rank,ShowDays,isRelease) "
    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",data)

    self.datas.remove(data)

    self.con.commit()
    self.con.close()

    def output_end(self):

    if len(self.datas) > 0:
    self.output_db('MTime')

    self.cx.close()

     爬虫调度器

    # coding:utf-8
    from UrlManager import UrlManager
    from DataOutput import DataOutput
    from HtmlDownloader import HtmlDownloader
    from HtmlParser import HtmlParser
    import time
    class SpiderMan(object):

    def __init__(self):

    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()

    def crawl(self,root_url):

    content = self.downloader.download(root_url)

    urls = self.parser.parser_url(root_url,content)


    for url in urls:

    try:
    t= time.strftime("%Y%m%d%H%M%S3282",time.localtime())
    rank_url ="http://service.library.mtime.com/Movie.api?"
    "Ajax_CallBack=true"
    "&Ajax_CallBackType=Mtime.Library.Services"
    "&Ajax_CallBackMethod=GetMovieOverviewRating"
    "&Ajax_CrossDomain=1"
    "&Ajax_RequestUrl=%s"
    "&t=%s"
    "&Ajax_CallBackArgument0=%s" %(url[0],t,url[1])

    #print rank_url
    #exit()
    rank_content = self.downloader.download(rank_url)

    data = self.parser.parser_json(rank_url,rank_content)

    self.output.store_data(data)
    except Exception,e:
    print e
    self.output.output_end()
    print "Crawl finish"

    if __name__ == '__main__':

    spider = SpiderMan()
    spider.crawl('http://theater.mtime.com/China_Beijing/')

  • 相关阅读:
    深入分析Spring之IOC之加载BeanDefinition案例详解
    JDK10的新特性:var和匿名类如何运用?正确的案例讲解
    SpringMVC中如何获取请求参数?案例详解
    如何用Spring Boot集成Ehcache缓存,教你三招搞定
    基础练习-4.数列特征
    基础练习-3.字母图形
    基础练习-2. 01字串
    基础练习-1.闰年判断
    入门训练-4. Fibonacci数列
    入门训练-3.圆的面积
  • 原文地址:https://www.cnblogs.com/paulversion/p/8393842.html
Copyright © 2011-2022 走看看