zoukankan      html  css  js  c++  java
  • Python动态网站的抓取

    网页下载器

    # coding:utf-8
    import requests
    import urllib2
    import sys
    # File-system encoding of the running interpreter.
    # NOTE(review): this name shadows the builtin ``type`` and is not
    # referenced anywhere in the visible code -- confirm before relying on it.
    type = sys.getfilesystemencoding()
    class HtmlDownloader(object):
        """Fetch the raw body of a URL using a browser-like User-Agent."""

        # Sent with every request so the server sees a desktop Chrome client.
        _USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/63.0.3239.132 Safari/537.36')

        def download(self, url):
            """Download *url* and return the response body.

            Args:
                url: Absolute URL to fetch, or ``None``.

            Returns:
                The body returned by ``response.read()`` when the HTTP status
                is 200; ``None`` when *url* is ``None``, the status is not
                200, or the request fails with ``urllib2.URLError``.
            """
            if url is None:
                return None

            headers = {'User-Agent': self._USER_AGENT}
            req = urllib2.Request(url, headers=headers)
            try:
                response = urllib2.urlopen(req)
            except urllib2.URLError as e:
                # urlopen raises for HTTP error statuses and network failures
                # (HTTPError is a URLError); report it and honour the
                # "None on failure" contract instead of propagating.
                print(e)
                return None

            try:
                if response.getcode() == 200:
                    return response.read()
                return None
            finally:
                # Release the underlying socket whichever branch was taken.
                response.close()

    网页解析器

    # coding:utf-8
    import re
    import json
    class HtmlParser(object):
        """Extract movie links and rating records from downloaded text."""

        # Captures (full movie URL, numeric movie id).
        _MOVIE_URL_RE = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')
        # Captures the text between the first "=" and the next ";".
        _JSON_BODY_RE = re.compile(r'=(.*?);')

        def parser_url(self, page_url, response):
            """Collect the distinct movie URLs found in *response*.

            Args:
                page_url: URL the text was downloaded from (not used here).
                response: Page text to scan; may be ``None`` or empty.

            Returns:
                A list of ``(url, movie_id)`` tuples without duplicates, in
                order of first appearance. Empty when *response* is ``None``
                or empty or nothing matches.
            """
            if not response:
                return []

            seen = set()
            unique = []
            for match in self._MOVIE_URL_RE.findall(response):
                if match not in seen:
                    seen.add(match)
                    unique.append(match)
            return unique

        def parser_json(self, page_url, response):
            """Parse an asynchronous ``... = {json};`` response into a record.

            Args:
                page_url: URL the response was fetched from; only used in
                    error output of the helper parsers.
                response: Response text; may be ``None`` or empty.

            Returns:
                The 14-item tuple built by ``_parser_release`` or
                ``_parser_no_release``, or ``None`` when the payload is
                missing, is not valid JSON, or lacks the ``value`` object.
            """
            if not response:
                return None

            # Pull out the text between "=" and ";".
            found = self._JSON_BODY_RE.search(response)
            if found is None:
                return None

            try:
                value = json.loads(found.group(1))
                detail = value.get('value')
                isRelease = detail.get('isRelease')
            except (ValueError, AttributeError) as e:
                # ValueError: malformed JSON. AttributeError: the decoded
                # document or its 'value' member is not an object.
                print(e)
                return None

            if not isRelease:
                return self._parser_no_release(page_url, value)
            if detail.get('releaseType') is None:
                return self._parser_release(page_url, value)
            # isRelease is set and a releaseType is present: reported as 2.
            return self._parser_no_release(page_url, value, isRelease=2)

        def _parser_release(self, page_url, value):
            """Build the record for a movie that has rating and box-office data.

            Args:
                page_url: Source URL, echoed in error output.
                value: Decoded JSON document containing a ``value`` object
                    with ``movieRating``, ``boxOffice`` and ``movieTitle``.

            Returns:
                ``(MovieId, movieTitle, RatingFinal, ROtherFinal,
                RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
                AttitudeCount, total box office, today box office, Rank,
                ShowDays, 1)``, or ``None`` if a required object is missing.
            """
            try:
                isRelease = 1
                detail = value.get('value')
                movieRating = detail.get('movieRating')
                boxOffice = detail.get('boxOffice')
                movieTitle = detail.get('movieTitle')

                RPictureFinal = movieRating.get('RPictureFinal')
                RStoryFinal = movieRating.get('RStoryFinal')
                RDirectorFinal = movieRating.get('RDirectorFinal')
                ROtherFinal = movieRating.get('ROtherFinal')
                RatingFinal = movieRating.get('RatingFinal')

                MovieId = movieRating.get('MovieId')
                Usercount = movieRating.get('Usercount')
                AttitudeCount = movieRating.get('AttitudeCount')

                TotalBoxOffice = boxOffice.get('TotalBoxOffice')
                TotalBoxOfficeUnit = boxOffice.get('TotalBoxOfficeUnit')
                TodayBoxOffice = boxOffice.get('TodayBoxOffice')
                TodayBoxOfficeUnit = boxOffice.get('TodayBoxOfficeUnit')

                ShowDays = boxOffice.get('ShowDays')
                # Fall back to 0 when the rank is absent or null.
                Rank = boxOffice.get('Rank') or 0

                return (
                    MovieId, movieTitle, RatingFinal, ROtherFinal,
                    RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
                    AttitudeCount,
                    # Amount and unit are joined into one display string.
                    TotalBoxOffice + TotalBoxOfficeUnit,
                    TodayBoxOffice + TodayBoxOfficeUnit,
                    Rank, ShowDays, isRelease)
            except (AttributeError, TypeError) as e:
                # AttributeError: an expected object is None.
                # TypeError: amount/unit could not be concatenated.
                print('%s %s %s' % (e, page_url, value))

            return None

        # Parse the record of a movie that has not been released yet.
        def _parser_no_release(self, page_url, value, isRelease=0):
            """Build the record for a movie without box-office data.

            Args:
                page_url: Source URL, echoed in error output.
                value: Decoded JSON document containing a ``value`` object
                    with ``movieRating`` and ``movieTitle``.
                isRelease: Release flag stored as the last tuple item.

            Returns:
                The same 14-item layout as ``_parser_release`` with both
                box-office fields set to ``u'无'``, Rank and ShowDays set to
                0; ``None`` if a required object is missing.
            """
            try:
                detail = value.get('value')
                movieRating = detail.get('movieRating')
                movieTitle = detail.get('movieTitle')

                RPictureFinal = movieRating.get('RPictureFinal')
                RStoryFinal = movieRating.get('RStoryFinal')
                RDirectorFinal = movieRating.get('RDirectorFinal')
                ROtherFinal = movieRating.get('ROtherFinal')
                RatingFinal = movieRating.get('RatingFinal')

                MovieId = movieRating.get('MovieId')
                Usercount = movieRating.get('Usercount')
                AttitudeCount = movieRating.get('AttitudeCount')

                Rank = 0

                return (
                    MovieId, movieTitle, RatingFinal, ROtherFinal,
                    RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
                    AttitudeCount,
                    u'无', u'无', Rank, 0, isRelease)
            except AttributeError as e:
                # An expected object ('value' or 'movieRating') is None.
                print('%s %s %s' % (e, page_url, value))

            return None

     数据存储器

    # coding:utf-8
    import MySQLdb


    class DataOutput(object):
        """Buffer parsed movie tuples and write them to MySQL in batches.

        Records are appended with ``store_data``; once more than
        ``_BATCH_SIZE`` are buffered they are inserted and committed.
        ``output_end`` writes whatever is left and closes the cursor and
        connection.
        """

        _TABLE_NAME = 'MTime'
        # A flush is triggered when the buffer holds more than this many rows.
        _BATCH_SIZE = 10

        # Column definitions, joined into one string by implicit literal
        # concatenation inside the parentheses. `Rank` is back-quoted because
        # it is a reserved word on newer MySQL servers.
        _COLUMN_DEFS = (
            "id int(11) not null primary key auto_increment,"
            "MovieId int(11),"
            "MovieTitle varchar(40) NOT NULL,"
            "RatingFinal double NOT NULL DEFAULT 0.0,"
            "ROtherFinal double NOT NULL DEFAULT 0.0,"
            "RPictureFinal double NOT NULL DEFAULT 0.0,"
            "RDirectorFinal double NOT NULL DEFAULT 0.0,"
            "RStoryFinal double NOT NULL DEFAULT 0.0,"
            "Usercount int(11) NOT NULL DEFAULT 0,"
            "AttitudeCount int(11) NOT NULL DEFAULT 0,"
            "TotalBoxOffice varchar(20) NOT NULL,"
            "TodayBoxOffice varchar(20) NOT NULL,"
            "`Rank` int(11) NOT NULL DEFAULT 0,"
            "ShowDays int(11) NOT NULL DEFAULT 0,"
            "isRelease int(11) NOT NULL"
        )

        # Insert columns, in the order of the tuples handed to store_data.
        _INSERT_COLUMNS = (
            "MovieId", "MovieTitle", "RatingFinal", "ROtherFinal",
            "RPictureFinal", "RDirectorFinal", "RStoryFinal", "Usercount",
            "AttitudeCount", "TotalBoxOffice", "TodayBoxOffice", "`Rank`",
            "ShowDays", "isRelease",
        )

        def __init__(self):
            """Open the MySQL connection and make sure the table exists."""
            self.con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='',
                                       db='go', port=3306, charset='utf8')
            self.cx = self.con.cursor()
            self.create_table(self._TABLE_NAME)
            self.datas = []

        @staticmethod
        def _create_table_sql(table_name):
            """Return the CREATE TABLE statement for *table_name*."""
            return ('CREATE TABLE IF NOT EXISTS %s(%s) '
                    'ENGINE=InnoDB DEFAULT CHARSET=utf8'
                    % (table_name, DataOutput._COLUMN_DEFS))

        @staticmethod
        def _insert_sql(table_name):
            """Return the parameterized INSERT statement for *table_name*."""
            placeholders = ",".join(["%s"] * len(DataOutput._INSERT_COLUMNS))
            return "INSERT INTO %s (%s) VALUES (%s)" % (
                table_name, ",".join(DataOutput._INSERT_COLUMNS), placeholders)

        def create_table(self, table_name):
            """Create *table_name* if it does not already exist."""
            self.cx.execute(self._create_table_sql(table_name))

        def store_data(self, data):
            """Buffer one record and flush when the buffer is large enough.

            Args:
                data: Tuple of 14 values in ``_INSERT_COLUMNS`` order, or
                    ``None`` (ignored).
            """
            if data is None:
                return
            self.datas.append(data)
            if len(self.datas) > self._BATCH_SIZE:
                self.output_db(self._TABLE_NAME)

        def output_db(self, table_name):
            """Insert every buffered record into *table_name* and commit.

            The buffer is emptied whether or not the write succeeds, so one
            bad record cannot block every later flush. On failure the
            transaction is rolled back and the exception is re-raised.
            The connection stays open for subsequent batches.
            """
            # Detach the batch first; the original removed items from the
            # list while iterating it, which skipped every other record.
            rows, self.datas = self.datas, []
            if not rows:
                return

            sql = self._insert_sql(table_name)
            try:
                for row in rows:
                    self.cx.execute(sql, row)
                self.con.commit()
            except Exception:
                self.con.rollback()
                raise

        def output_end(self):
            """Write any remaining records, then close cursor and connection."""
            try:
                if len(self.datas) > 0:
                    self.output_db(self._TABLE_NAME)
            finally:
                self.cx.close()
                self.con.close()

     爬虫调度器

    # coding:utf-8
    from UrlManager import UrlManager
    from DataOutput import DataOutput
    from HtmlDownloader import HtmlDownloader
    from HtmlParser import HtmlParser
    import time
    class SpiderMan(object):
        """Crawl scheduler: download a listing page, then fetch, parse and
        store the rating data of every movie linked from it."""

        # Rating service endpoint. Placeholders, in order: movie page URL,
        # timestamp token, movie id. The parentheses make the adjacent
        # literals one string so the % formatting applies to all of it.
        _RANK_URL_TEMPLATE = (
            "http://service.library.mtime.com/Movie.api?"
            "Ajax_CallBack=true"
            "&Ajax_CallBackType=Mtime.Library.Services"
            "&Ajax_CallBackMethod=GetMovieOverviewRating"
            "&Ajax_CrossDomain=1"
            "&Ajax_RequestUrl=%s"
            "&t=%s"
            "&Ajax_CallBackArgument0=%s"
        )

        def __init__(self):
            """Create the downloader, parser and storage collaborators."""
            self.downloader = HtmlDownloader()
            self.parser = HtmlParser()
            self.output = DataOutput()

        @staticmethod
        def _build_rank_url(movie_url, movie_id, timestamp):
            """Return the rating-service URL for one movie."""
            return SpiderMan._RANK_URL_TEMPLATE % (movie_url, timestamp, movie_id)

        def crawl(self, root_url):
            """Crawl every movie linked from *root_url* and store its record.

            Args:
                root_url: URL of the listing page to start from.

            A failure while processing one movie is printed and does not stop
            the remaining movies. Buffered records are flushed at the end.
            """
            content = self.downloader.download(root_url)

            urls = None
            if content is not None:
                urls = self.parser.parser_url(root_url, content)

            # Each entry is a (movie_url, movie_id) pair; tolerate a missing
            # download or a None result by iterating nothing.
            for url in urls or []:
                try:
                    # Local time as YYYYmmddHHMMSS followed by the fixed
                    # suffix "3282".
                    t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                    rank_url = self._build_rank_url(url[0], url[1], t)

                    rank_content = self.downloader.download(rank_url)
                    data = self.parser.parser_json(rank_url, rank_content)
                    self.output.store_data(data)
                except Exception as e:
                    # Per-movie boundary: report and carry on with the rest.
                    print(e)

            self.output.output_end()
            print("Crawl finish")

    if __name__ == '__main__':
        # Script entry point: crawl the Beijing theater listing page.
        SpiderMan().crawl('http://theater.mtime.com/China_Beijing/')

  • 相关阅读:
    批量刷新远程物化视图的方法(备用)
    Oracle 11g中CTE应用示例
    PL/SQL DEVELOPER中查询结果复制出来中文乱码的解决方案
    RHEL5.5 64位下安装Oracle 11g 64位安装前置条件的两种方法
    不良言论屏蔽方案探讨——自说自话方案
    AWWWB 网站克隆器 v2.0发布
    OneNote中到底能放多少种东西?
    Visual Studio 2010 旗舰版 安装问题小记
    WPF命中测试示例(一)——坐标点命中测试
    网页内嵌Windows Media Player播放器的多文件播放方法
  • 原文地址:https://www.cnblogs.com/paulversion/p/8393842.html
Copyright © 2011-2022 走看看