zoukankan      html  css  js  c++  java
  • 正则练习

    
    
     1 import re
     2 from urllib.request import urlopen  # 打开一个连接  读取源代码
     3 import ssl
     4 
     5 # 干掉数字签名证书
     6 ssl._create_default_https_context = ssl._create_default_https_context
     7 
     8 
     9 def getPage(url):
    10     response = urlopen(url)  # 和网页链接
    11     return response.read().decode("utf-8")  # 返回正常的页面源代码,一大堆html
    12 
    13 
    14 def parsePage(s):  # s是页面源代码
    15     ret = re.findall('<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>d+).*?' +
    16                      '<span class="title">(?P<title>.*?)</span>' +
    17                      '.*?<span class="rating_num".*?>(?P<rating_num>.*?)</span>.*?' +
    18                      '<span>(?P<comment_num>.*?)评价</span>', s, re.S)
    19     return ret
    20 
    21 
    22 def main(num):
    23     url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    24     response_html = getPage(url)  # response_html 是html页面源码
    25     ret = parsePage(response_html)
    26     print(ret)
    27 
    28 
    29 count = 0
    30 for i in range(1):  # 1031     main(count)
    32     count += 25
    View Code

    写入文件版
     1 import re
     2 from urllib.request import urlopen
     3 import ssl
     4 
     5 ssl._create_default_https_context = ssl._create_default_https_context
     6 
     7 
     8 def getPage(url):
     9     response = urlopen(url)
    10     return response.read().decode("utf-8")
    11 
    12 
    13 def parsePage(s):
    14     com = re.compile(
    15         '<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>d+).*?' +
    16         '<span class="title">(?P<title>.*?)</span>' +
    17         '.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>' +
    18         '(?P<comment_num>.*?)评价</span>', re.S)
    19     ret = com.finditer(s)
    20     for i in ret:
    21         yield {
    22             "id": i.group("id"),
    23             "title": i.group("title"),
    24             "rating_num": i.group("rating_num"),
    25             "comment_num": i.group("comment_num"),
    26         }
    27 
    28 
    29 def main(num):
    30     url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    31     response_html = getPage(url)
    32     ret = parsePage(response_html)
    33     f = open("move", "a", encoding="utf-8")
    34 
    35     for obj in ret:
    36         # print(obj)
    37         data = str(obj)
    38         f.write(data + "
    ")
    39 
    40 
    41 count = 0
    42 for i in range(5):
    43     main(count)
    View Code


  • 相关阅读:
    [DDCTF 2019]homebrew event loop
    [极客大挑战 2019]FinalSQL
    $[HAOI2008]$硬币购物
    $2018/8/19 = Day5$学习笔记 + 杂题整理
    $2018/8/16 = Day2$学习笔记$+$杂题整理
    [NOIp2009] $Hankson$の趣味题
    2018清北学堂夏日培训游记
    2.数组的声明和创建
    1.什么是数组?
    15.递归
  • 原文地址:https://www.cnblogs.com/Majintao/p/9778462.html
Copyright © 2011-2022 走看看