zoukankan      html  css  js  c++  java
  • 柯林斯

    原文地址:# https://www.cnblogs.com/dylan9/p/9207366.html

    python代码:

     1 # 关于线程以及进程的使用
     2 #文件名:sample.py
     3 import time
     4 
     5 import requests
     6 from lxml import etree
     7 from multiprocessing.dummy import Pool
     # HTTP headers passed to requests.get() by get_urls() and get_data();
     # only a desktop-Chrome User-Agent string is set.
     8 headers = {
     9     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    10 }
    11 
    12 # url = "https://www.collinsdictionary.com/zh/browse/english/"
    13 #
    14 # page_text = requests.get(url=url, headers=headers).text
    15 #
    16 # tree = etree.HTML(page_text)
    17 #
    18 # li_list = tree.xpath("//ul[@class='bLtr']/li/a/@href")[1:]
    # Pool of 20 workers (multiprocessing.dummy.Pool, i.e. thread-backed)
    # shared by both crawl stages at the bottom of the script.
    pool = Pool(20)

    # First-level pages: one "words starting with <letter>" index per letter
    # a-z. Generated instead of spelled out 26 times; to try the crawl on a
    # single letter, shorten the letter string.
    li_list = [
        'https://www.collinsdictionary.com/zh/browse/english/words-starting-with-' + letter
        for letter in 'abcdefghijklmnopqrstuvwxyz'
    ]

    # Second-level URLs; filled by get_urls() and then consumed by get_data().
    deep_url_list = []

    # Wall-clock start, used for the elapsed-time report at the end.
    start = time.time()
    28 
    def get_urls(url, timeout=10):
        """Collect the links found on one index page into ``deep_url_list``.

        Downloads ``url``, selects the ``href`` of every element matching
        ``//ul[@class='columns2 bL']/li/a`` and appends those values to the
        module-level ``deep_url_list``. Nothing is returned.

        Args:
            url: Address of the page to download.
            timeout: Seconds ``requests`` waits for the server. The original
                call passed no timeout, so a stalled connection could block a
                pool worker indefinitely.
        """
        response = requests.get(url=url, headers=headers, timeout=timeout)
        tree = etree.HTML(response.text)
        deep_url_list.extend(tree.xpath("//ul[@class='columns2 bL']/li/a/@href"))
    34 
    35 
    def get_data(url, timeout=10):
        """Append the words listed on one page to ``word2.txt``, one per line.

        Downloads ``url`` and, for every ``//ul[@class='columns2 bL']/li``
        element, takes the first text node of its direct ``<a>`` children.

        Args:
            url: Address of the page to download.
            timeout: Seconds ``requests`` waits for the server before giving up.
        """
        page_text = requests.get(url=url, headers=headers, timeout=timeout).text
        tree = etree.HTML(page_text)

        words = []
        for li in tree.xpath("//ul[@class='columns2 bL']/li"):
            texts = li.xpath('./a/text()')
            if texts:  # an <li> without link text previously raised IndexError
                words.append(texts[0])

        if not words:
            return

        # Open the file once per page and emit a single write, rather than
        # re-opening it for every word as the original loop did.
        with open("word2.txt", "a", encoding="utf-8") as f:
            f.write(''.join(word + '\n' for word in words))
    44 
    45 
    # Stage 1: visit every letter page; get_urls() fills deep_url_list.
    # pool.map blocks until all of them are done, so the list is complete
    # before stage 2 reads it.
    pool.map(get_urls, li_list)

    # Stage 2: download every collected page and write its words to disk.
    # NOTE: AsyncResult.wait() only waits; unlike get() it does not re-raise
    # an exception from a worker, so failed pages go unreported here.
    result = pool.map_async(get_data, deep_url_list)
    result.wait()

    # No further tasks will be submitted: shut the pool down explicitly
    # instead of leaving its worker threads to interpreter exit.
    pool.close()
    pool.join()

    print("执行完毕")
    print("耗时:", time.time()-start)

    Windows 下安装 Python,安装 pip、安装 requests 包,结果没有用

     1 import re
     2 import random
     3 import requests
     4 
     5 from bs4 import BeautifulSoup
     6 from concurrent.futures import ThreadPoolExecutor
     7 from multiprocessing import cpu_count
     8 
     9 # ------------------------- 制作英文词典 --------------------------------------
    10 
    # Matches any run of the characters - & ( ) / .
    # worker() skips every entry whose text contains such a match.
    11 rex = re.compile(r'[-&()/.]+')
    12 
    13 
    def bar(url, timeout=10):
        """Return the ``<a>`` tags inside a page's ``columns2 browse-list`` list.

        Args:
            url: Address of the page to download.
            timeout: Seconds ``requests`` waits for the server before giving up
                (the original call had no timeout).

        Returns:
            The result of ``find_all(name='a')`` on the first
            ``<ul class="columns2 browse-list">`` element, or an empty list
            when the page contains no such element.
        """
        response = requests.get(url=url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        # attrs must be a dict; the original passed the set literal
        # {'class', 'columns2 browse-list'} where a dict was intended.
        ul_obj = soup.find(name='ul', attrs={'class': 'columns2 browse-list'})
        if ul_obj is None:
            # Page without the expected list: nothing to follow, rather than
            # an AttributeError on None.
            return []
        return ul_obj.find_all(name='a')
    19 
    20 
    def worker(url):
        """Crawl all entries for one starting letter and write them out.

        Fetches
        ``https://www.collinsdictionary.com/browse/english/words-starting-with-<letter>``,
        follows every link returned by ``bar()`` for that page, and for each
        anchor on the linked pages prints and writes its text, one per line.
        Entries of two characters or fewer, or containing any character
        matched by ``rex``, are skipped.

        Args:
            url: Despite the name, a two-item sequence: ``url[0]`` is the
                starting letter and ``url[1]`` is an object with a ``write``
                method that receives the accepted entries.
        """
        letter, out_file = url[0], url[1]
        a_list = bar(url='https://www.collinsdictionary.com/browse/english/words-starting-with-{}'.format(letter))
        for item in a_list:
            href = item.get('href')
            if not href:
                # An anchor without a target cannot be fetched.
                continue
            for i in bar(href):
                res = i.text
                # search() stops at the first hit; findall() built a full list
                # only to test whether it was empty.
                if len(res) > 2 and not rex.search(res):
                    print(res)
                    out_file.write('{}\n'.format(res))
    33 
    34 
    def spider_collins():
        """Crawl the Collins browse pages and append the entries to ``w.txt``.

        One ``worker`` task is submitted per letter ``a``-``z`` (26 in total).
        Each task walks the remaining link levels itself: the letter page,
        the pages it links to, and the individual entries listed on those.
        """
        with open('w.txt', 'a', encoding='utf8') as f:
            # The executor block is nested inside the file block on purpose:
            # leaving it waits for every worker to finish, so the file is
            # still open for all writes and is closed even if something raises.
            with ThreadPoolExecutor(cpu_count() * 5) as t:
                for i in range(ord('a'), ord('z') + 1):  # 97 ~ 122
                    t.submit(worker, (chr(i), f))

    差不多一个意思吧,还要消化下

    pip install requests 

    等待系统自动加载安装。 

     
  • 相关阅读:
    Django中获取参数(路径,查询,请求头,请求体)
    正则表达式基础、原理
    每日总结【2020/02/12】
    【家庭记账本】Android开发(初稿)
    每日总结【2020/02/11】
    【家庭记账本】Android开发日记(九)
    【家庭记账本】Android开发日记(八)
    【家庭记账本】Android开发(提交稿件)
    每日总结【2020/02/09】
    《构建之法》阅读笔记(三)
  • 原文地址:https://www.cnblogs.com/guochaoxxl/p/12848973.html
Copyright © 2011-2022 走看看