zoukankan      html  css  js  c++  java
  • Python动态网站的抓取

    网页下载器

    # coding:utf-8
    import requests
    import urllib2
    import sys
    # NOTE(review): shadows the builtin `type` and is never referenced in this
    # module -- presumably a leftover filesystem-encoding probe; confirm it is
    # unused elsewhere before removing. `requests` is imported but the
    # downloader below uses urllib2.
    type = sys.getfilesystemencoding()
    class HtmlDownloader(object):
        """Downloads raw HTML for a given URL."""

        def download(self, url):
            """Fetch *url* and return the response body, or None on failure.

            Fixes vs original: the `slef` parameter typo, and an unhandled
            network exception escaping despite the return-None-on-failure
            contract. Uses `requests` (already imported by this module but
            previously unused in favor of urllib2).
            """
            if url is None:
                return None
            user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
            headers = {'User-Agent': user_agent}
            try:
                # timeout so a dead server cannot hang the whole crawl
                response = requests.get(url, headers=headers, timeout=30)
            except requests.RequestException as e:
                print(e)
                return None
            if response.status_code == 200:
                return response.content
            return None

    网页解析器

    # coding:utf-8
    import re
    import json
    class HtmlParser(object):
        """Parses crawled pages: movie detail URLs and the async rating API payload."""

        def parser_url(self, page_url, response):
            """Extract (detail_url, movie_id) pairs from page HTML.

            Returns a deduplicated list of tuples (empty list when nothing
            matches), or None when *response* is None.
            """
            if response is None:
                # robustness: downloader may have returned None
                return None
            # BUG FIX: the original pattern used (d+), which matches a literal
            # letter 'd' -- \d+ is needed to capture the numeric movie id.
            pattern = re.compile(r'(http://movie.mtime.com/(\d+)/)')
            urls = pattern.findall(response)
            # findall returns a list (possibly empty), never None, so the
            # original `if urls != None` branch was dead code.
            return list(set(urls))  # dedupe

        def parser_json(self, page_url, response):
            """Parse the async rating response ('var x = {...};' JSONP style).

            Returns the 14-field row tuple, or None on a malformed payload.
            """
            # grab the payload between '=' and the first ';'
            pattern = re.compile(r'=(.*?);')
            matches = pattern.findall(response)
            if not matches:
                # robustness: original indexed [0] and raised IndexError here
                return None
            value = json.loads(matches[0])
            try:
                isRelease = value.get('value').get('isRelease')
            except Exception as e:
                print(e)
                return None
            if isRelease:
                if value.get('value').get('releaseType') == None:
                    return self._parser_release(page_url, value)
                else:
                    return self._parser_no_release(page_url, value, isRelease=2)
            else:
                return self._parser_no_release(page_url, value)

        def _parser_release(self, page_url, value):
            """Build the 14-field DB row for a released movie.

            Returns None (after logging) when the payload misses expected keys.
            """
            try:
                isRelease = 1
                movie_value = value.get('value')
                movieRating = movie_value.get('movieRating')
                boxOffice = movie_value.get('boxOffice')
                # NOTE(review): the original read key 'moveTitle' here while
                # _parser_no_release reads 'movieTitle'; unified on
                # 'movieTitle' -- confirm against a live API response.
                moveTitle = movie_value.get('movieTitle')

                RPictureFinal = movieRating.get('RPictureFinal')
                RStoryFinal = movieRating.get('RStoryFinal')
                RDirectorFinal = movieRating.get('RDirectorFinal')
                ROtherFinal = movieRating.get('ROtherFinal')
                # NOTE(review): original key 'RarhingFinal' looks like a typo
                # for 'RatingFinal', the key _parser_no_release uses -- confirm.
                RatingFinal = movieRating.get('RatingFinal')

                # NOTE(review): original key 'MoviedId' looks like a typo.
                MovieId = movieRating.get('MovieId')
                Usercount = movieRating.get('Usercount')
                AttitudeCount = movieRating.get('AttitudeCount')

                TotalBoxOffice = boxOffice.get('TotalBoxOffice')
                TotalBoxOfficeUnit = boxOffice.get('TotalBoxOfficeUnit')
                TodayBoxOffice = boxOffice.get('TodayBoxOffice')
                TodayBoxOfficeUnit = boxOffice.get('TodayBoxOfficeUnit')
                ShowDays = boxOffice.get('ShowDays')
                # dict.get never raises, so the original try/except around this
                # lookup was dead code; default to 0 when the key is absent.
                Rank = boxOffice.get('Rank', 0)

                return (MovieId, moveTitle, RatingFinal, ROtherFinal,
                        RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
                        AttitudeCount,
                        TotalBoxOffice + TotalBoxOfficeUnit,
                        TodayBoxOffice + TodayBoxOfficeUnit,
                        Rank, ShowDays, isRelease)
            except Exception as e:
                print(e, page_url, value)
                return None

        def _parser_no_release(self, page_url, value, isRelease=0):
            """Build the 14-field DB row for a not-yet-released movie.

            *isRelease* is 0 for unreleased, 2 for special release types.
            Returns None (after logging) when the payload misses expected keys.
            """
            try:
                movie_value = value.get('value')
                movieRating = movie_value.get('movieRating')
                moveTitle = movie_value.get('movieTitle')
                RPictureFinal = movieRating.get('RPictureFinal')
                RStoryFinal = movieRating.get('RStoryFinal')
                RDirectorFinal = movieRating.get('RDirectorFinal')
                ROtherFinal = movieRating.get('ROtherFinal')
                RatingFinal = movieRating.get('RatingFinal')
                MovieId = movieRating.get('MovieId')
                Usercount = movieRating.get('Usercount')
                AttitudeCount = movieRating.get('AttitudeCount')
                # no box office before release; the original wrapped this
                # constant in a pointless try/except -- Rank is simply 0
                Rank = 0
                return (MovieId, moveTitle, RatingFinal, ROtherFinal,
                        RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
                        AttitudeCount, u'无', u'无', Rank, 0, isRelease)
            except Exception as e:
                print(e, page_url, value)
                return None

     数据存储器

    # coding:utf-8
    import MySQLdb


    class DataOutput(object):
        """Buffers parsed movie rows and writes them to MySQL in batches."""

        def __init__(self):
            # NOTE(review): credentials/host are hard-coded -- consider config
            self.con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='',
                                       db='go', port=3306, charset='utf8')
            self.cx = self.con.cursor()
            self.create_table('MTime')
            self.datas = []  # in-memory buffer of 14-field row tuples

        def create_table(self, table_name):
            """Create the movie table if it does not already exist."""
            # BUG FIX: the original column list was a series of bare string
            # statements, so only the first line was ever assigned to
            # `values`; parentheses make the literals concatenate.
            # `Rank` is backtick-quoted: RANK is a reserved word in MySQL 8.
            values = (
                "id int(11) not null primary key auto_increment,"
                "MovieId int(11),"
                "MovieTitle varchar(40) NOT NULL,"
                "RatingFinal double NOT NULL DEFAULT 0.0,"
                "ROtherFinal double NOT NULL DEFAULT 0.0,"
                "RPictureFinal double NOT NULL DEFAULT 0.0,"
                "RDirectorFinal double NOT NULL DEFAULT 0.0,"
                "RStoryFinal double NOT NULL DEFAULT 0.0,"
                "Usercount int(11) NOT NULL DEFAULT 0,"
                "AttitudeCount int(11) NOT NULL DEFAULT 0,"
                "TotalBoxOffice varchar(20) NOT NULL,"
                "TodayBoxOffice varchar(20) NOT NULL,"
                "`Rank` int(11) NOT NULL DEFAULT 0,"
                "ShowDays int(11) NOT NULL DEFAULT 0,"
                "isRelease int(11) NOT NULL"
            )
            self.cx.execute('CREATE TABLE IF NOT EXISTS %s(%s) '
                            'ENGINE=InnoDB DEFAULT CHARSET=utf8'
                            % (table_name, values))

        def store_data(self, data):
            """Buffer one row tuple; flush once more than 10 rows accumulate."""
            if data is None:
                return
            self.datas.append(data)
            if len(self.datas) > 10:
                self.output_db('MTime')

        def output_db(self, table_name):
            """Flush all buffered rows into *table_name* and commit.

            BUG FIXES vs original: iterate a snapshot instead of removing
            items from the list while looping over it (which silently skipped
            every other row); honor *table_name* instead of hard-coding
            'MTime'; and do NOT close the connection here -- closing after
            the first flush broke every subsequent flush. output_end owns
            teardown.
            """
            for data in list(self.datas):
                self.cx.execute(
                    "INSERT INTO " + table_name + " (MovieId,MovieTitle,"
                    "RatingFinal,ROtherFinal,RPictureFinal,RDirectorFinal,"
                    "RStoryFinal,Usercount,AttitudeCount,TotalBoxOffice,"
                    "TodayBoxOffice,`Rank`,ShowDays,isRelease) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    data)
            self.datas = []
            self.con.commit()

        def output_end(self):
            """Flush any remaining rows and release DB resources."""
            if self.datas:
                self.output_db('MTime')
            self.cx.close()
            self.con.close()

     爬虫调度器

    # coding:utf-8
    from UrlManager import UrlManager
    from DataOutput import DataOutput
    from HtmlDownloader import HtmlDownloader
    from HtmlParser import HtmlParser
    import time
    class SpiderMan(object):
        """Coordinates downloader, parser and storage for the MTime crawl.

        NOTE(review): the module imports UrlManager but never instantiates
        it -- presumably leftover scaffolding; confirm before removing.
        """

        def __init__(self):
            self.downloader = HtmlDownloader()
            self.parser = HtmlParser()
            self.output = DataOutput()

        def crawl(self, root_url):
            """Crawl *root_url* for movie pages, hit each rating API, store rows."""
            content = self.downloader.download(root_url)
            urls = self.parser.parser_url(root_url, content)
            if not urls:
                # robustness: original crashed iterating None when the
                # download or parse produced nothing
                print("no movie urls found on %s" % root_url)
                self.output.output_end()
                return
            for url in urls:
                try:
                    t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                    # BUG FIX: the original query string was split across bare
                    # string-literal statements, so only the first fragment was
                    # ever assigned; parentheses make the pieces concatenate.
                    rank_url = ("http://service.library.mtime.com/Movie.api?"
                                "Ajax_CallBack=true"
                                "&Ajax_CallBackType=Mtime.Library.Services"
                                "&Ajax_CallBackMethod=GetMovieOverviewRating"
                                "&Ajax_CrossDomain=1"
                                "&Ajax_RequestUrl=%s"
                                "&t=%s"
                                "&Ajax_CallBackArgument0=%s"
                                % (url[0], t, url[1]))
                    rank_content = self.downloader.download(rank_url)
                    data = self.parser.parser_json(rank_url, rank_content)
                    self.output.store_data(data)
                except Exception as e:
                    # best-effort: one bad movie must not abort the crawl
                    print(e)
            self.output.output_end()
            print("Crawl finish")

    if __name__ == '__main__':
        # entry point: crawl the Beijing theater listing page
        SpiderMan().crawl('http://theater.mtime.com/China_Beijing/')

  • 相关阅读:
    Spark中RDD、DataFrame和DataSet的区别
    如何为Spark应用程序分配--num-executors,--execuor-cores和--executor-memory
    一些常用的Spark SQL调优技巧
    使用sendmail命令发送附件
    spark.sql.shuffle.partitions 和 spark.default.parallelism 的区别
    Spark Shuffle
    [Spark学习] Spark RDD详解
    将时间戳(timestamp)转换为MongoDB中的ObjectId
    如何使用pig的AvroStorage存储array/map类型
    关于Avro中的Unions类型
  • 原文地址:https://www.cnblogs.com/paulversion/p/8393842.html
Copyright © 2011-2022 走看看