zoukankan      html  css  js  c++  java
  • 豆瓣电影

    》》》基本方法

    from urllib import request

    # Same Top 250 list URL used throughout this page, here at offset 25.
    URL = "https://movie.douban.com/top250?start=25&filter="

    # The context manager closes the HTTP response even if read() or decode()
    # raises, so the underlying connection is not leaked.
    with request.urlopen(URL) as response:
        content = response.read().decode('utf-8')
    print(content)
    View Code

    》》》代理服务器

    from urllib import request

    # Placeholder address: replace xx.xx.xx.xx:xx with a real proxy host:port.
    PROXY = 'http://xx.xx.xx.xx:xx'

    # ProxyHandler selects the proxy by URL scheme. The page below is fetched
    # over https, so an 'https' entry is needed; with only 'http' mapped the
    # request would not be routed through this proxy.
    proxy_support = request.ProxyHandler({'http': PROXY, 'https': PROXY})
    opener = request.build_opener(proxy_support, request.HTTPHandler)
    # Make every later request.urlopen() call go through this opener.
    request.install_opener(opener)

    # The context manager closes the response once the body has been read.
    with request.urlopen('https://movie.douban.com/') as response:
        content = response.read().decode('utf-8')
    print(content)
    View Code

    》》》伪装成浏览器

     from urllib import parse, request

     # urlencode() returns str, but Request requires bytes for `data`.
     # Passing data (even an empty body) makes urllib send a POST request.
     postdata = parse.urlencode({}).encode('utf-8')

     # A browser-like User-Agent is what makes the request look like a browser.
     headers = {
         'User-Agent': 'Mozilla/5.0(Window NT 10.0;WOW64)',
     }

     # The headers must be passed to Request itself; assigning them afterwards
     # has no effect on the request object.
     req = request.Request(
         url="http://movie.douban.com",
         data=postdata,
         headers=headers,
     )
    View Code

    》》》页面信息抓取

     from urllib import error, request


     class MoviesTop(object):
         """Download consecutive pages of the Douban Top 250 movie list."""

         def __init__(self):
             # Offset of the first movie on the next page to fetch.
             self.start = 0
             # Query-string suffix seen on the site's paging URLs; it is not
             # appended to the URL built in get_page().
             self.param = '&filter='
             # Browser-like User-Agent sent with every request.
             self.headers = {'User-Agent': 'Mozilla/5.0(Window NT 10.0;WOW64)'}

         def get_page(self):
             """Fetch pages while ``self.start <= 100`` and return their HTML.

             ``self.start`` is advanced by 25 after each successful download,
             so calling this again continues where the previous call stopped.

             Returns:
                 A list of decoded HTML strings, one per downloaded page. If a
                 request fails, the reason is printed and the pages fetched so
                 far are returned instead of being discarded.
             """
             page_content = []
             while self.start <= 100:
                 url = 'https://movie.douban.com/top250?start=' + str(self.start)
                 req = request.Request(url, headers=self.headers)
                 try:
                     # The context manager closes the response after reading.
                     with request.urlopen(req) as response:
                         page = response.read().decode('utf-8')
                 except error.URLError as e:
                     print('抓取失败,原因是', e.reason)
                     break
                 # Offsets 0, 25, 50, ... map to page numbers 1, 2, 3, ...
                 page_num = (self.start + 25) // 25
                 print('抓取' + str(page_num) + '页数据')
                 self.start += 25
                 page_content.append(page)
             return page_content

         def main(self):
             """Run the download and report progress on stdout."""
             print('开始抓取数据')
             self.get_page()
             print('数据抓取完毕')


     # Only scrape when executed as a script, so importing the class does not
     # trigger network requests.
     if __name__ == '__main__':
         a = MoviesTop()
         a.main()
    View Code

    》》》提取相关信息

    import re

    # `.*?` is a non-greedy match: the group captures as little as possible, so
    # it stops at the first closing </span></p> after the opening tags.
    html_text = ' <p class="quote"><span class="inq">希望让人自由。</span></p>'
    reObj = re.compile(r'p class="quote"><span class="inq">(.*?)</span></p>')
    print(reObj)
    # findall() returns the list of captured group texts, one per match.
    a = reObj.findall(html_text)
    print(a)
    View Code

    》》》完整代码并将数据存入txt文件中

     from urllib import error, request
     import re


     class MoviesTop(object):
         """Scrape Douban Top 250 entries and save them to a text file."""

         def __init__(self, file_path='D:movies_spider2.txt'):
             """Initialise the paging state and the output location.

             Args:
                 file_path: File that write_text() creates or overwrites. The
                     default uses a .txt extension because the output is plain
                     UTF-8 text, not a spreadsheet.
             """
             # Offset of the first movie on the next page to fetch.
             self.start = 0
             # Query-string suffix seen on the site's paging URLs; it is not
             # appended to the URL built in get_page().
             self.param = '&filter='
             # Browser-like User-Agent sent with every request.
             self.headers = {'User-Agent': 'Mozilla/5.0(Window NT 10.0;WOW64)'}
             # One 7-item list per movie, filled by get_movies_info().
             self.movies_list = []
             self.file_path = file_path

         def get_page(self):
             """Fetch the page at offset ``self.start`` and advance by 25.

             Returns:
                 The decoded HTML of the page, or None when the request fails.
                 On failure the reason is printed and ``self.start`` is left
                 unchanged.
             """
             url = 'https://movie.douban.com/top250?start=' + str(self.start)
             req = request.Request(url, headers=self.headers)
             try:
                 # The context manager closes the response after reading.
                 with request.urlopen(req) as response:
                     page = response.read().decode('utf-8')
             except error.URLError as e:
                 print('抓取失败,原因是', e.reason)
                 return None
             # Offsets 0, 25, 50, ... map to page numbers 1, 2, 3, ...
             page_num = (self.start + 25) // 25
             print('抓取' + str(page_num) + '页数据')
             self.start += 25
             return page

         def get_movies_info(self):
             """Download pages while ``self.start < 50`` and collect movie fields.

             Each regex match is appended to ``self.movies_list`` as a list of
             seven captured strings, in the order of the groups below.
             """
             # Groups: rank, title, alias title, director, cast, number of
             # ratings, one-line quote. re.S lets `.` span line breaks.
             # NOTE(review): every group is required, so an entry without an
             # alias title or a quote presumably does not match cleanly --
             # verify against live pages.
             pattern = re.compile(r'<em.*?class="">(.*?)</em>.*?'
                                  r'<span.*?class="title">(.*?)</span>.*?'
                                  r'<span.*?class="title">&nbsp;/&nbsp;(.*?)</span>.*?'
                                  r'导演:(.*?)&nbsp;&nbsp;&nbsp;.*?'
                                  r'主演:(.*?)<br>.*?'
                                  r'<span>(.*?)人评价</span>.*?'
                                  r'<span.*?class="inq">(.*?)</span>.*?', re.S)
             print('>>>', pattern)

             while self.start < 50:
                 page = self.get_page()
                 if page is None:
                     # Download failed: stop here. Passing None to the regex
                     # would raise TypeError, and self.start did not advance.
                     break
                 for movie in pattern.findall(page):
                     print(movie)
                     self.movies_list.append(list(movie))
                     print(movie[6])

         def write_text(self):
             """Write every collected movie to ``self.file_path`` as UTF-8 text.

             Each movie produces seven lines, one "label + value" line per
             field. A failure while writing is printed rather than raised.
             """
             print("开始写数据")
             labels = ('电影排名: ', '电影名称:', '电影别名:', '导演姓名:',
                       '主演姓名:', '参评人数:', '剪短影评:')
             # The context manager closes the file even if a write fails.
             with open(self.file_path, 'w', encoding='utf-8') as file_top:
                 try:
                     for movie in self.movies_list:
                         file_top.writelines(
                             label + value + '\n'
                             for label, value in zip(labels, movie))
                     print('写入成功')
                 except OSError as e:
                     print(e)

         def main(self):
             """Scrape the pages, then save the results."""
             print('开始抓取数据')
             self.get_movies_info()
             self.write_text()
             print('数据抓取完毕')


     # Only scrape when executed as a script, so importing the class does not
     # trigger network requests or file writes.
     if __name__ == '__main__':
         a = MoviesTop()
         a.main()
    View Code

    》》》完整代码并将数据存入数据库

     from urllib import error, request
     import re


     class MoviesTop(object):
         """Scrape rank, title and alias title from Douban Top 250 into MySQL."""

         def __init__(self):
             # Offset of the first movie on the next page to fetch.
             self.start = 0
             # Query-string suffix seen on the site's paging URLs; it is not
             # appended to the URL built in get_page().
             self.param = '&filter='
             # Browser-like User-Agent sent with every request.
             self.headers = {'User-Agent': 'Mozilla/5.0(Window NT 10.0;WOW64)'}
             # One [rank, title, alias] list per movie.
             self.movies_list = []

         def get_page(self):
             """Fetch the page at offset ``self.start`` and advance by 25.

             Returns:
                 The decoded HTML of the page, or None when the request fails.
                 On failure the reason is printed and ``self.start`` is left
                 unchanged.
             """
             url = 'https://movie.douban.com/top250?start=' + str(self.start)
             req = request.Request(url, headers=self.headers)
             try:
                 # The context manager closes the response after reading.
                 with request.urlopen(req) as response:
                     page = response.read().decode('utf-8')
             except error.URLError as e:
                 print('抓取失败,原因是', e.reason)
                 return None
             # Offsets 0, 25, 50, ... map to page numbers 1, 2, 3, ...
             page_num = (self.start + 25) // 25
             print('抓取' + str(page_num) + '页数据')
             self.start += 25
             return page

         def get_movies_info(self):
             """Download pages while ``self.start < 25`` and collect movie fields.

             Each regex match is appended to ``self.movies_list`` as a
             three-item list: rank, title, alias title.
             """
             # re.S lets `.` span line breaks between the matched tags.
             pattern = re.compile(r'<em.*?class="">(.*?)</em>.*?'
                                  r'<span.*?class="title">(.*?)</span>.*?'
                                  r'<span.*?class="title">&nbsp;/&nbsp;(.*?)</span>.*?', re.S)

             while self.start < 25:
                 page = self.get_page()
                 if page is None:
                     # Download failed: stop here. Passing None to the regex
                     # would raise TypeError, and self.start did not advance.
                     break
                 movies = pattern.findall(page)
                 # Debug output of the second match; guarded so a page with
                 # fewer than two matches does not raise IndexError.
                 if len(movies) > 1:
                     print(movies[1])
                     print(movies[1][2])
                 for movie in movies:
                     print(movie)
                     self.movies_list.append(list(movie))

         def insert_into_sql(self):
             """Insert every collected movie into table ``m1``.

             Each row is committed on its own. A database error is printed,
             the failed statement is rolled back, and the remaining rows are
             skipped. The cursor and connection are always closed.
             """
             # Imported here so the scraping half of the class can be used
             # without the MySQL driver installed.
             import MySQLdb

             # NOTE(review): connection settings are hard-coded for this demo.
             conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                                    passwd='123666', db='test', charset='utf8')
             cur = conn.cursor()
             # Placeholders are filled in by the driver, so quotes inside a
             # scraped title cannot break or alter the statement.
             insert_str = ("insert into m1 (moviesRank,MoviesName,OtherName) "
                           "values(%s,%s,%s)")
             try:
                 for movie in self.movies_list:
                     a = movie[0]
                     print(a)
                     b = movie[1]
                     print(b)
                     c = movie[2]
                     print(c)
                     cur.execute(insert_str, (a, b, c))
                     conn.commit()
                     print('>>>>')
             except MySQLdb.Error as e:
                 conn.rollback()
                 print(e)
             finally:
                 cur.close()
                 conn.close()

         def main(self):
             """Scrape the pages, then store the results in the database."""
             print('开始抓取数据')
             self.get_movies_info()
             self.insert_into_sql()
             print('数据抓取完毕')


     # Only scrape when executed as a script, so importing the class does not
     # trigger network requests or database writes.
     if __name__ == '__main__':
         b = MoviesTop()
         b.main()
    View Code
  • 相关阅读:
    利用bat合并两个hex文件
    Laravel中使用自己的类库三种方式
    Carbon 的 diffForHumans 方法
    5 个 Laravel Eloquent 小技巧
    laravel 批量更新
    laravel 打印sql语句
    PHP获取客户端的IP地址
    PHP跨域访问
    解析URL参数
    转: 雅虎35条优化黄金守则
  • 原文地址:https://www.cnblogs.com/cerofang/p/8087965.html
Copyright © 2011-2022 走看看