zoukankan      html  css  js  c++  java
  • Python动态网站的抓取

    网页下载器

    # coding:utf-8
    import requests
    import urllib2
    import sys
    type = sys.getfilesystemencoding()
    class HtmlDownloader(object):

    def download(slef, url):

    if url is None:
    return None

    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    if response.getcode() == 200:
    html = response.read()
    return html


    return None

    网页解析器

    # coding:utf-8
    import re
    import json
    class HtmlParser(object):

    def parser_url(self, page_url, response):

    pattern = re.compile(r'(http://movie.mtime.com/(d+)/)')
    urls = pattern.findall(response)
    if urls != None:
    # 将urls进行去重
    return list(set(urls))
    else:
    return None

    # 解析异步响应值
    def parser_json(self, page_url, response):

    # 将"="和";"之间的内容提取出来
    pattern = re.compile(r'=(.*?);')
    result = pattern.findall(response)[0]

    if result != None:
    value = json.loads(result)
    try:
    isRelease = value.get('value').get('isRelease')
    except Exception, e:
    print e
    return None
    if isRelease:
    if value.get('value').get('releaseType') == None:
    return self._parser_release(page_url, value)
    else:
    return self._parser_no_release(page_url, value, isRelease=2)
    else:

    return self._parser_no_release(page_url, value)

    def _parser_release(self, page_url, value):

    try:
    isRelease = 1
    movieRating = value.get('value').get('movieRating')
    boxOffice = value.get('value').get('boxOffice')
    moveTitle = value.get('value').get('moveTitle')
    RPictureFinal = movieRating.get('RPictureFinal')
    RStoryFinal = movieRating.get('RStoryFinal')
    RDirectorFinal = movieRating.get('RDirectorFinal')
    ROtherFinal = movieRating.get('ROtherFinal')
    RathingFinal = movieRating.get('RarhingFinal')

    MovieId = movieRating.get('MoviedId')
    Usercount = movieRating.get('Usercount')
    AttitudeCount = movieRating.get('AttitudeCount')

    TotalBoxOffice = boxOffice.get('TotalBoxOffice')
    TotalBoxOfficeUnit = boxOffice.get('TotalBoxOfficeUnit')
    TodayBoxOffice = boxOffice.get('TodayBoxOffice')
    TodayBoxOfficeUnit = boxOffice.get('TodayBoxOfficeUnit')

    ShowDays = boxOffice.get('ShowDays')

    try:

    Rank = boxOffice.get('Rank')
    except Exception, e:
    Rank = 0

    return (
    MovieId, moveTitle, RathingFinal, ROtherFinal, RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
    AttitudeCount
    , TotalBoxOffice + TotalBoxOfficeUnit, TodayBoxOffice + TodayBoxOfficeUnit, Rank, ShowDays, isRelease)
    except Exception, e:
    print e, page_url, value

    return None

    # 解析未上映的电影信息
    def _parser_no_release(self, page_url, value, isRelease=0):

    try:
    movieRating = value.get('value').get('movieRating')
    moveTitle = value.get('value').get('movieTitle')
    RPictureFinal = movieRating.get('RPictureFinal')
    RStoryFinal = movieRating.get('RStoryFinal')
    RDirectorFinal = movieRating.get('RDirectorFinal')
    ROtherFinal = movieRating.get('ROtherFinal')
    RatingFinal = movieRating.get('RatingFinal')

    MovieId = movieRating.get('MovieId')
    Usercount = movieRating.get('Usercount')
    AttitudeCount = movieRating.get('AttitudeCount')

    try:

    Rank = 0

    except Exception, e:
    Rank =0
    return (
    MovieId, moveTitle, RatingFinal, ROtherFinal, RPictureFinal, RDirectorFinal, RStoryFinal,
    Usercount,
    AttitudeCount
    , u'无', u'无', Rank, 0, isRelease)

    except Exception, e:

    print e, page_url, value

    return None

     数据存储器

    # coding:utf-8
    import MySQLdb


    class DataOutput(object):

    def __init__(self):
    self.con =MySQLdb.connect(host='127.0.0.1', user='root', passwd='', db='go',port=3306,charset='utf8')
    self.cx = self.con.cursor()
    self.create_table('MTime')
    self.datas = []

    def create_table(self, table_name):

    values = "id int(11) not null primary key auto_increment,"
    "MovieId int(11),"
    "MovieTitle varchar(40) NOT NULL,"
    "RatingFinal double NOT NULL DEFAULT 0.0,"
    "ROtherFinal double NOT NULL DEFAULT 0.0,"
    "RPictureFinal double NOT NULL DEFAULT 0.0,"
    "RDirectorFinal double NOT NULL DEFAULT 0.0,"
    "RStoryFinal double NOT NULL DEFAULT 0.0,"
    "Usercount int(11) NOT NULL DEFAULT 0,"
    "AttitudeCount int(11) NOT NULL DEFAULT 0,"
    "TotalBoxOffice varchar(20) NOT NULL,"
    "TodayBoxOffice varchar(20) NOT NULL,"
    "Rank int(11) NOT NULL DEFAULT 0,"
    "ShowDays int(11) NOT NULL DEFAULT 0,"
    "isRelease int(11) NOT NULL"
    ""
    #print 'CREATE TABLE IF NOT EXISTS %s(%s)' % (table_name, values)

    self.cx.execute('CREATE TABLE IF NOT EXISTS %s(%s) ENGINE=InnoDB DEFAULT CHARSET=utf8' % (table_name, values))

    def store_data(self, data):

    if data is None:
    return
    self.datas.append(data)
    if len(self.datas) > 10:
    self.output_db('MTime')

    def output_db(self, table_name):
    for data in self.datas:
    self.cx.execute("INSERT INTO MTime (MovieId,MovieTitle,RatingFinal,ROtherFinal,RPictureFinal,RDirectorFinal,"
    "RStoryFinal,Usercount,AttitudeCount,TotalBoxOffice,TodayBoxOffice,Rank,ShowDays,isRelease) "
    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",data)

    self.datas.remove(data)

    self.con.commit()
    self.con.close()

    def output_end(self):

    if len(self.datas) > 0:
    self.output_db('MTime')

    self.cx.close()

     爬虫调度器

    # coding:utf-8
    from UrlManager import UrlManager
    from DataOutput import DataOutput
    from HtmlDownloader import HtmlDownloader
    from HtmlParser import HtmlParser
    import time
    class SpiderMan(object):

    def __init__(self):

    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()

    def crawl(self,root_url):

    content = self.downloader.download(root_url)

    urls = self.parser.parser_url(root_url,content)


    for url in urls:

    try:
    t= time.strftime("%Y%m%d%H%M%S3282",time.localtime())
    rank_url ="http://service.library.mtime.com/Movie.api?"
    "Ajax_CallBack=true"
    "&Ajax_CallBackType=Mtime.Library.Services"
    "&Ajax_CallBackMethod=GetMovieOverviewRating"
    "&Ajax_CrossDomain=1"
    "&Ajax_RequestUrl=%s"
    "&t=%s"
    "&Ajax_CallBackArgument0=%s" %(url[0],t,url[1])

    #print rank_url
    #exit()
    rank_content = self.downloader.download(rank_url)

    data = self.parser.parser_json(rank_url,rank_content)

    self.output.store_data(data)
    except Exception,e:
    print e
    self.output.output_end()
    print "Crawl finish"

    if __name__ == '__main__':

    spider = SpiderMan()
    spider.crawl('http://theater.mtime.com/China_Beijing/')

  • 相关阅读:
    用java实现的微信公众号爬虫
    装饰模式
    输入一个数,查询该数是否为素数
    Machine-learning-DecisionTree
    Circles of Waiting
    GRE阅读
    云数据库POLARDB产品解读之二:如何做到高性价比
    奉上一份云上数据安全保护指南
    从双十一看阿里云安全的“创世纪”——采访阿里云安全掌门人肖力有感
    阿里云移动研发平台EMAS,是如何连续5年安全护航双11的?
  • 原文地址:https://www.cnblogs.com/paulversion/p/8393842.html
Copyright © 2011-2022 走看看