  • python3 -- practice exercises (scraping Movie Heaven (电影天堂) movie resources, university rankings, Taobao price comparison)

    import requests
    import re

    # List pages of the "latest movies" section on 电影天堂 (Movie Heaven)
    url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
    for n in range(1, 2):
        new_url = url.format(n)
        html_1 = requests.get(new_url)
        html_1.encoding = 'gb2312'  # the site serves GB2312-encoded pages
        detail_list = re.findall('<a href="(.*?)" class="ulink">', html_1.text)

        for m in detail_list:
            b_url = 'http://www.ygdy8.net' + m
            html_2 = requests.get(b_url)
            html_2.encoding = 'gb2312'
            ftp = re.findall('<a href="(.*?)">.*?</a></td>', html_2.text)
            with open('tddy.txt', 'a', encoding='utf-8') as f:
                f.write(ftp[0] + '\n')  # keep the first download link on the page
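
One caveat on the detail-page regex: `'<a href="(.*?)">.*?</a></td>'` leans on the surrounding table markup and the script keeps only `ftp[0]`. If the layout shifts, a small sketch like the following, which matches `ftp://` URLs directly, may be more forgiving (`get_ftp_links` is a hypothetical helper name, not part of the original script):

    import re
    import requests

    def get_ftp_links(detail_url):
        # Match download URLs directly instead of relying on <td> markup.
        resp = requests.get(detail_url, timeout=20)
        resp.encoding = 'gb2312'
        return re.findall(r'ftp://[^"<]+', resp.text)

Each detail page can then contribute every link it carries, not just the first one.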

    University ranking exercise

    import bs4
    import requests
    from bs4 import BeautifulSoup

    def get_html_text(url):
        try:
            r = requests.get(url, timeout=20)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except:
            return ""


    def fill_univ_list(ulist, html):
        soup = BeautifulSoup(html, "html.parser")
        for tr in soup.find('tbody').children:
            if isinstance(tr, bs4.element.Tag):  # skip NavigableStrings, keep only tag nodes
                tds = tr('td')
                ulist.append([tds[0].string, tds[1].string, tds[3].string])


    def print_univ_list(ulist, num):
        # chr(12288) is the fullwidth CJK space, used as the fill character
        tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
        print(tplt.format("排名", "学校名称", "总分", chr(12288)))
        for i in range(num):
            u = ulist[i]
            print(tplt.format(u[0], u[1], u[2], chr(12288)))


    def main():
        uinfo = []
        url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
        html = get_html_text(url)
        fill_univ_list(uinfo, html)
        print_univ_list(uinfo, 20)


    main()
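
The `chr(12288)` argument is the point of this exercise's formatting: it is the fullwidth CJK space, passed in as the fill character so Chinese school names pad out to the same display width. A minimal demonstration of the difference:

    # Padding with the default ASCII space misaligns CJK columns, because one
    # Chinese character occupies roughly two ASCII widths in most terminals.
    print("{0:^10}|".format("清华大学"))                 # narrow-space padding
    print("{0:{1}^10}|".format("清华大学", chr(12288)))  # fullwidth-space padding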

     Taobao product price comparison:

    import requests
    import re

    def get_html_text(url):
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        except:
            return ""


    def parse_page(ilt, html):
        try:
            # prices and titles are embedded as JSON fields in the search page
            plt = re.findall(r'"view_price":"[\d.]*"', html)
            tlt = re.findall(r'"raw_title":".*?"', html)
            for i in range(len(plt)):
                price = eval(plt[i].split(':')[1])  # eval strips the surrounding quotes
                title = eval(tlt[i].split(':')[1])
                ilt.append([price, title])
        except:
            print("")

    def print_goods_list(ilt):
        tplt = "{:4}\t{:8}\t{:16}"
        print(tplt.format("序号", "价格", "商品名称"))
        count = 0
        for g in ilt:
            count = count + 1
            print(tplt.format(count, g[0], g[1]))

    def main():
        goods = '减肥餐'
        depth = 2
        start_url = 'http://s.taobao.com/search?q=' + goods
        info_list = []
        for i in range(depth):
            try:
                url = start_url + '&s=' + str(44*i)  # each results page offsets by 44 items
                html = get_html_text(url)
                parse_page(info_list, html)
            except:
                continue
        print_goods_list(info_list)

    main()
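
Note that Taobao has since put search results behind a login wall, so a bare `requests.get` tends to return a login page with no `"view_price"` fields to match. A sketch of working around that by sending a logged-in browser's cookie (the `cookie` argument is a placeholder you would copy from your own session, not something the original script provides):

    import requests

    def get_html_text_with_cookie(url, cookie):
        # cookie: the raw Cookie header copied from a logged-in browser session
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Cookie': cookie,
        }
        try:
            r = requests.get(url, timeout=30, headers=headers)
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        except:
            return ""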

     Stock data:

    import re
    import traceback

    import requests
    import sys
    from bs4 import BeautifulSoup


    def get_html_text(url, code='utf-8'):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
        try:
            r = requests.get(url, timeout=20, headers=headers)
            r.raise_for_status()
            r.encoding = code
            return r.text
        except:
            return ""

    def get_stock_list(lst, stock_url):
        html = get_html_text(stock_url, 'GB2312')
        soup = BeautifulSoup(html, 'html.parser')
        a = soup.find_all('a')
        for i in a:
            # stock codes look like sh600000 / sz000001
            stock_code = re.findall(r'[s][hz]\d{6}', str(i))
            if len(stock_code) != 0:
                lst.append(stock_code)


    def get_stock_info(lst, stock_url, fpath):
        count = 0
        for stock in lst:
            url = stock_url + stock[0] + '.html'
            print(url)
            html = get_html_text(url)
            try:
                if html == "":
                    continue
                info_dict = {}
                soup = BeautifulSoup(html, 'html.parser')
                stock_info = soup.find('div', attrs={'class': 'stock-bets'})
                info_dict.update({'股票名称': stock_info.text.split()[0]})

                key_list = stock_info.find_all('dt')
                value_list = stock_info.find_all('dd')
                for i in range(len(key_list)):
                    key = key_list[i].text
                    info_dict[key] = value_list[i].text

                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(info_dict) + '\n')
                    count = count + 1
                    print("\r当前进度: {:.2f}%".format(count * 100 / len(lst)), end="")
            except:
                traceback.print_exc(file=sys.stdout)
                count = count + 1
                print("\r当前进度: {:.2f}%".format(count * 100 / len(lst)), end="")
                continue

    def main():
        stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
        stock_info_url = 'http://gupiao.baidu.com/stock/'
        output_file = 'D:/BaiduStockInfo.txt'
        slist = []
        get_stock_list(slist, stock_list_url)
        get_stock_info(slist, stock_info_url, output_file)

    main()
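
One refinement worth considering: `f.write(str(info_dict) + '\n')` stores Python `repr` lines, which are awkward to read back. A sketch that writes one JSON object per line instead (`append_record` is a hypothetical helper with the same append semantics as the original):

    import json

    def append_record(fpath, info_dict):
        # One JSON object per line; ensure_ascii=False keeps Chinese keys readable.
        with open(fpath, 'a', encoding='utf-8') as f:
            f.write(json.dumps(info_dict, ensure_ascii=False) + '\n')

Each stored line can later be reloaded with a single `json.loads` call.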
  • Original post: https://www.cnblogs.com/jonm/p/8353301.html