zoukankan      html  css  js  c++  java
  • Python爬虫(七)

    源码:

     1 import requests
     2 import re
     3 from my_mysql import MysqlConnect
     4 
     # Fetch one listing page and collect detail-page links with movie titles.
     def get_urls(page, timeout=10):
         """Return the ``(href, title)`` pairs found on listing page *page*.

         Args:
             page: Page number substituted into the listing URL.
             timeout: Seconds to wait for the server before ``requests``
                 raises a timeout error. Without it a stalled connection
                 would block the crawl indefinitely.

         Returns:
             A list of ``(href, title)`` tuples, one per ``<a ... class="ulink">``
             anchor in the page; an empty list when none match.
         """
         url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'.format(page)
         response = requests.get(url, timeout=timeout)
         # Decode the body as GBK instead of relying on the guessed encoding.
         response.encoding = 'gbk'
         pat = r'<a href="(.*?)" class="ulink">(.*?)</a>'
         return re.findall(pat, response.text)
    15 
    # Fetch a movie detail page and extract its magnet and ftp download links.
    def get_links(url, timeout=10):
        """Return ``(magnet, ftp)`` link strings scraped from the page at *url*.

        Args:
            url: Address of the movie detail page.
            timeout: Seconds to wait for the server before ``requests``
                raises a timeout error.

        Returns:
            A 2-tuple ``(magnet, ftp)``. Each element is the first ``href``
            value starting with ``magnet`` / ``ftp`` respectively, or ``''``
            when the page contains no such link (previously this case raised
            ``AttributeError`` on the ``None`` match object).
        """
        response = requests.get(url, timeout=timeout)
        # Decode the body as GBK instead of relying on the guessed encoding.
        response.encoding = 'gbk'
        html = response.text

        links = []
        for pat in (r'href="(magnet.*?)"', r'href="(ftp.*?)"'):
            found = re.search(pat, html)
            # re.search returns None when nothing matches; fall back to ''.
            links.append(found.group(1) if found else '')
        magnet, ftp = links
        return magnet, ftp
    29 
    if __name__ == '__main__':
        # Local import: only the script entry point needs URL joining.
        from urllib.parse import urljoin

        # NOTE(review): database credentials are hard-coded here; consider
        # loading them from configuration or the environment.
        mc = MysqlConnect('127.0.0.1', 'root', '123456', 'homework')
        # Crawl listing pages 1 through 3.
        for page in range(1, 4):
            for href, name in get_urls(page):
                # urljoin copes with root-relative ("/html/..."), relative and
                # absolute hrefs; plain concatenation produced "//html/..." for
                # root-relative links and garbage for absolute ones.
                url = urljoin('http://www.dytt8.net/', href)
                magnet, ftp = get_links(url)
                # NOTE(review): the statement is assembled by string formatting
                # from scraped text. If MysqlConnect.exec can take bound
                # parameters, switch to a parameterized query -- TODO confirm
                # against the my_mysql module.
                sql = 'insert into dytt(id,name,magnet,ftp) values(null,{},{},{})'.format(repr(name), repr(magnet), repr(ftp))
                print(sql)
                mc.exec(sql)
  • 相关阅读:
    【PAT】1001 害死人不偿命的(3n+1)猜想(动态更新)
    文件指令集
    近距离接触电脑
    文件管理
    文件写作方法
    文件读取方法
    打开文件的逻辑
    话术库
    max的逻辑
    抽象化指令
  • 原文地址:https://www.cnblogs.com/zhxd-python/p/9501317.html
Copyright © 2011-2022 走看看