zoukankan      html  css  js  c++  java
  • Python动态网站的抓取

    网页下载器

    # coding:utf-8
    import requests
    import urllib2
    import sys
    # NOTE(review): shadows the builtin `type` and is never referenced in this
    # module -- presumably a leftover filesystem-encoding probe; confirm it is
    # unused elsewhere before removing. `requests` is imported but the
    # downloader below uses urllib2.
    type = sys.getfilesystemencoding()
    class HtmlDownloader(object):
        """Downloads raw HTML for a given URL."""

        def download(self, url):
            """Fetch *url* and return the response body, or None on failure.

            Fixes vs original: the `slef` parameter typo, and an unhandled
            network exception escaping despite the return-None-on-failure
            contract. Uses `requests` (already imported by this module but
            previously unused in favor of urllib2).
            """
            if url is None:
                return None
            user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
            headers = {'User-Agent': user_agent}
            try:
                # timeout so a dead server cannot hang the whole crawl
                response = requests.get(url, headers=headers, timeout=30)
            except requests.RequestException as e:
                print(e)
                return None
            if response.status_code == 200:
                return response.content
            return None

    网页解析器

    # coding:utf-8
    import re
    import json
    class HtmlParser(object):
        """Parses crawled pages: movie detail URLs and the async rating API payload."""

        def parser_url(self, page_url, response):
            """Extract (detail_url, movie_id) pairs from page HTML.

            Returns a deduplicated list of tuples (empty list when nothing
            matches), or None when *response* is None.
            """
            if response is None:
                # robustness: downloader may have returned None
                return None
            # BUG FIX: the original pattern used (d+), which matches a literal
            # letter 'd' -- \d+ is needed to capture the numeric movie id.
            pattern = re.compile(r'(http://movie.mtime.com/(\d+)/)')
            urls = pattern.findall(response)
            # findall returns a list (possibly empty), never None, so the
            # original `if urls != None` branch was dead code.
            return list(set(urls))  # dedupe

        def parser_json(self, page_url, response):
            """Parse the async rating response ('var x = {...};' JSONP style).

            Returns the 14-field row tuple, or None on a malformed payload.
            """
            # grab the payload between '=' and the first ';'
            pattern = re.compile(r'=(.*?);')
            matches = pattern.findall(response)
            if not matches:
                # robustness: original indexed [0] and raised IndexError here
                return None
            value = json.loads(matches[0])
            try:
                isRelease = value.get('value').get('isRelease')
            except Exception as e:
                print(e)
                return None
            if isRelease:
                if value.get('value').get('releaseType') == None:
                    return self._parser_release(page_url, value)
                else:
                    return self._parser_no_release(page_url, value, isRelease=2)
            else:
                return self._parser_no_release(page_url, value)

        def _parser_release(self, page_url, value):
            """Build the 14-field DB row for a released movie.

            Returns None (after logging) when the payload misses expected keys.
            """
            try:
                isRelease = 1
                movie_value = value.get('value')
                movieRating = movie_value.get('movieRating')
                boxOffice = movie_value.get('boxOffice')
                # NOTE(review): the original read key 'moveTitle' here while
                # _parser_no_release reads 'movieTitle'; unified on
                # 'movieTitle' -- confirm against a live API response.
                moveTitle = movie_value.get('movieTitle')

                RPictureFinal = movieRating.get('RPictureFinal')
                RStoryFinal = movieRating.get('RStoryFinal')
                RDirectorFinal = movieRating.get('RDirectorFinal')
                ROtherFinal = movieRating.get('ROtherFinal')
                # NOTE(review): original key 'RarhingFinal' looks like a typo
                # for 'RatingFinal', the key _parser_no_release uses -- confirm.
                RatingFinal = movieRating.get('RatingFinal')

                # NOTE(review): original key 'MoviedId' looks like a typo.
                MovieId = movieRating.get('MovieId')
                Usercount = movieRating.get('Usercount')
                AttitudeCount = movieRating.get('AttitudeCount')

                TotalBoxOffice = boxOffice.get('TotalBoxOffice')
                TotalBoxOfficeUnit = boxOffice.get('TotalBoxOfficeUnit')
                TodayBoxOffice = boxOffice.get('TodayBoxOffice')
                TodayBoxOfficeUnit = boxOffice.get('TodayBoxOfficeUnit')
                ShowDays = boxOffice.get('ShowDays')
                # dict.get never raises, so the original try/except around this
                # lookup was dead code; default to 0 when the key is absent.
                Rank = boxOffice.get('Rank', 0)

                return (MovieId, moveTitle, RatingFinal, ROtherFinal,
                        RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
                        AttitudeCount,
                        TotalBoxOffice + TotalBoxOfficeUnit,
                        TodayBoxOffice + TodayBoxOfficeUnit,
                        Rank, ShowDays, isRelease)
            except Exception as e:
                print(e, page_url, value)
                return None

        def _parser_no_release(self, page_url, value, isRelease=0):
            """Build the 14-field DB row for a not-yet-released movie.

            *isRelease* is 0 for unreleased, 2 for special release types.
            Returns None (after logging) when the payload misses expected keys.
            """
            try:
                movie_value = value.get('value')
                movieRating = movie_value.get('movieRating')
                moveTitle = movie_value.get('movieTitle')
                RPictureFinal = movieRating.get('RPictureFinal')
                RStoryFinal = movieRating.get('RStoryFinal')
                RDirectorFinal = movieRating.get('RDirectorFinal')
                ROtherFinal = movieRating.get('ROtherFinal')
                RatingFinal = movieRating.get('RatingFinal')
                MovieId = movieRating.get('MovieId')
                Usercount = movieRating.get('Usercount')
                AttitudeCount = movieRating.get('AttitudeCount')
                # no box office before release; the original wrapped this
                # constant in a pointless try/except -- Rank is simply 0
                Rank = 0
                return (MovieId, moveTitle, RatingFinal, ROtherFinal,
                        RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
                        AttitudeCount, u'无', u'无', Rank, 0, isRelease)
            except Exception as e:
                print(e, page_url, value)
                return None

     数据存储器

    # coding:utf-8
    import MySQLdb


    class DataOutput(object):
        """Buffers parsed movie rows and writes them to MySQL in batches."""

        def __init__(self):
            # NOTE(review): credentials/host are hard-coded -- consider config
            self.con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='',
                                       db='go', port=3306, charset='utf8')
            self.cx = self.con.cursor()
            self.create_table('MTime')
            self.datas = []  # in-memory buffer of 14-field row tuples

        def create_table(self, table_name):
            """Create the movie table if it does not already exist."""
            # BUG FIX: the original column list was a series of bare string
            # statements, so only the first line was ever assigned to
            # `values`; parentheses make the literals concatenate.
            # `Rank` is backtick-quoted: RANK is a reserved word in MySQL 8.
            values = (
                "id int(11) not null primary key auto_increment,"
                "MovieId int(11),"
                "MovieTitle varchar(40) NOT NULL,"
                "RatingFinal double NOT NULL DEFAULT 0.0,"
                "ROtherFinal double NOT NULL DEFAULT 0.0,"
                "RPictureFinal double NOT NULL DEFAULT 0.0,"
                "RDirectorFinal double NOT NULL DEFAULT 0.0,"
                "RStoryFinal double NOT NULL DEFAULT 0.0,"
                "Usercount int(11) NOT NULL DEFAULT 0,"
                "AttitudeCount int(11) NOT NULL DEFAULT 0,"
                "TotalBoxOffice varchar(20) NOT NULL,"
                "TodayBoxOffice varchar(20) NOT NULL,"
                "`Rank` int(11) NOT NULL DEFAULT 0,"
                "ShowDays int(11) NOT NULL DEFAULT 0,"
                "isRelease int(11) NOT NULL"
            )
            self.cx.execute('CREATE TABLE IF NOT EXISTS %s(%s) '
                            'ENGINE=InnoDB DEFAULT CHARSET=utf8'
                            % (table_name, values))

        def store_data(self, data):
            """Buffer one row tuple; flush once more than 10 rows accumulate."""
            if data is None:
                return
            self.datas.append(data)
            if len(self.datas) > 10:
                self.output_db('MTime')

        def output_db(self, table_name):
            """Flush all buffered rows into *table_name* and commit.

            BUG FIXES vs original: iterate a snapshot instead of removing
            items from the list while looping over it (which silently skipped
            every other row); honor *table_name* instead of hard-coding
            'MTime'; and do NOT close the connection here -- closing after
            the first flush broke every subsequent flush. output_end owns
            teardown.
            """
            for data in list(self.datas):
                self.cx.execute(
                    "INSERT INTO " + table_name + " (MovieId,MovieTitle,"
                    "RatingFinal,ROtherFinal,RPictureFinal,RDirectorFinal,"
                    "RStoryFinal,Usercount,AttitudeCount,TotalBoxOffice,"
                    "TodayBoxOffice,`Rank`,ShowDays,isRelease) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    data)
            self.datas = []
            self.con.commit()

        def output_end(self):
            """Flush any remaining rows and release DB resources."""
            if self.datas:
                self.output_db('MTime')
            self.cx.close()
            self.con.close()

     爬虫调度器

    # coding:utf-8
    from UrlManager import UrlManager
    from DataOutput import DataOutput
    from HtmlDownloader import HtmlDownloader
    from HtmlParser import HtmlParser
    import time
    class SpiderMan(object):
        """Coordinates downloader, parser and storage for the MTime crawl.

        NOTE(review): the module imports UrlManager but never instantiates
        it -- presumably leftover scaffolding; confirm before removing.
        """

        def __init__(self):
            self.downloader = HtmlDownloader()
            self.parser = HtmlParser()
            self.output = DataOutput()

        def crawl(self, root_url):
            """Crawl *root_url* for movie pages, hit each rating API, store rows."""
            content = self.downloader.download(root_url)
            urls = self.parser.parser_url(root_url, content)
            if not urls:
                # robustness: original crashed iterating None when the
                # download or parse produced nothing
                print("no movie urls found on %s" % root_url)
                self.output.output_end()
                return
            for url in urls:
                try:
                    t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                    # BUG FIX: the original query string was split across bare
                    # string-literal statements, so only the first fragment was
                    # ever assigned; parentheses make the pieces concatenate.
                    rank_url = ("http://service.library.mtime.com/Movie.api?"
                                "Ajax_CallBack=true"
                                "&Ajax_CallBackType=Mtime.Library.Services"
                                "&Ajax_CallBackMethod=GetMovieOverviewRating"
                                "&Ajax_CrossDomain=1"
                                "&Ajax_RequestUrl=%s"
                                "&t=%s"
                                "&Ajax_CallBackArgument0=%s"
                                % (url[0], t, url[1]))
                    rank_content = self.downloader.download(rank_url)
                    data = self.parser.parser_json(rank_url, rank_content)
                    self.output.store_data(data)
                except Exception as e:
                    # best-effort: one bad movie must not abort the crawl
                    print(e)
            self.output.output_end()
            print("Crawl finish")

    if __name__ == '__main__':
        # entry point: crawl the Beijing theater listing page
        SpiderMan().crawl('http://theater.mtime.com/China_Beijing/')

  • 相关阅读:
    Spark中RDD、DataFrame和DataSet的区别
    如何为Spark应用程序分配--num-executors,--execuor-cores和--executor-memory
    一些常用的Spark SQL调优技巧
    使用sendmail命令发送附件
    spark.sql.shuffle.partitions 和 spark.default.parallelism 的区别
    Spark Shuffle
    [Spark学习] Spark RDD详解
    将时间戳(timestamp)转换为MongoDB中的ObjectId
    如何使用pig的AvroStorage存储array/map类型
    关于Avro中的Unions类型
  • 原文地址:https://www.cnblogs.com/paulversion/p/8393842.html
Copyright © 2011-2022 走看看