zoukankan      html  css  js  c++  java
  • 爬虫初识(爬取dytt电影列表及下载地址)

    import re
    from  urllib.request import urlopen
    def getPage(url):
        """Fetch *url* and return its body decoded as GBK text.

        Bytes that are not valid GBK are dropped (``errors='ignore'``) rather
        than raising, so the result may be missing characters.

        :param url: address to open with ``urllib.request.urlopen``.
        :return: the response body as ``str``.
        """
        # Use the response as a context manager so the underlying connection
        # is closed even if read() or decode() raises.
        with urlopen(url) as response:
            return response.read().decode('gbk', errors='ignore')
    def parsePage(s):
        """Return the absolute URL of the first detail link found in *s*.

        Only the first ``class="ulink"`` anchor inside a
        ``<td height="26">`` cell is used; later matches are ignored.

        :param s: HTML text of a list page.
        :return: ``"http://www.dytt8.net"`` + the matched href, or ``None``
            when the pattern does not match anywhere in *s*.
        """
        link_pattern = re.compile(r'<td height="26">.*?<b>.*?<a href="(?P<url_name>.*?)" class="ulink">.*?',re.S)
        first_hit = link_pattern.search(s)
        if first_hit is None:
            return None
        return "http://www.dytt8.net" + first_hit.group("url_name")
    def parsePage1(s):
        """Yield one record per movie description matched in *s*.

        The pattern looks inside ``<div id="Zoom">`` for the labelled fields
        (translated title, title, director, cast) and the first download link
        following the synopsis label.

        :param s: HTML text of a movie detail page.
        :return: generator of dicts with the keys ``yiming``, ``pianming``,
            ``daoyan``, ``zhuyan`` and ``xiazaidizhi``; every value has the
            ideographic space U+3000 removed.
        """
        com = re.compile(
            r'<div id="Zoom">.*?译.*?名(?P<name>.*?)<br />◎片.*?名(?P<pianname>.*?)<br />.*?◎导.*?演(?P<daoyan>.*?)<br />'
            '◎主.*?演(?P<zhuyan>.*?)<br /><br />◎简.*?介.*?<td.*?><a href="(?P<xiazaidizhi>.*?)">',
            re.S,
        )
        # Output key -> regex group name, in the order the record is built.
        key_to_group = (
            ("yiming", "name"),
            ("pianming", "pianname"),
            ("daoyan", "daoyan"),
            ("zhuyan", "zhuyan"),
            ("xiazaidizhi", "xiazaidizhi"),
        )
        for match in com.finditer(s):
            # Strip only the ideographic space (U+3000) used as padding after
            # the field labels. The previous character class "[u3000]" removed
            # the literal characters 'u', '3' and '0', which corrupted titles
            # and download URLs.
            yield {
                key: match.group(group).replace("\u3000", "")
                for key, group in key_to_group
            }
    def main(num):
        """Scrape list page *num* and append its movie record(s) to ``move_list``.

        Steps: fetch the list page, take the detail-page URL returned by
        ``parsePage``, fetch that page, then print every record produced by
        ``parsePage1`` and append it (one ``str(dict)`` per line) to the file
        ``move_list`` in the current directory.

        :param num: page number substituted into the list-page URL.
        :return: ``None``.
        """
        url = "http://www.dytt8.net/html/gndy/dyzz/list_23_%s.html" % num
        response_html = getPage(url)
        xiangqing = parsePage(response_html)
        if xiangqing is None:
            # parsePage returns None when the list page has no matching link;
            # skip this page instead of passing None to urlopen.
            return
        response1_html = getPage(xiangqing)
        # The context manager guarantees the file is flushed and closed even
        # if printing or writing fails part-way through.
        with open("move_list", "a", encoding="utf8") as f:
            for obj in parsePage1(response1_html):
                print(obj)
                f.write(str(obj) + "\n")
    # Walk list pages 1 through 180 in order, scraping each one.
    for page_number in range(1, 181):
        main(page_number)
  • 相关阅读:
    UVA-448
    算法提高-集合选取
    算法训练Maze
    UVA-10061
    树状数组
    前缀和
    【UVA
    统计Linux下的CPU状态信息
    Android_内部文件读取
    Android打开/data/目录以及导出文件
  • 原文地址:https://www.cnblogs.com/zhoushibin-1/p/9780285.html
Copyright © 2011-2022 走看看