zoukankan      html  css  js  c++  java
  • 利用requests、BeautifulSoup、xml写多线程爬虫

    # -*- coding:UTF-8 -*-
    import requests,time
    from collections import OrderedDict
    import threading
    from bs4 import BeautifulSoup as bp
    
        
    
    # Wall-clock start time; the end of the script subtracts it to report elapsed seconds.
    t3 = time.time()
    ths = []  # holds the worker threads
    
    
    def get(num, timeout=10):
        """Fetch one result page from the search endpoint and print its records.

        Posts the search form with ``dbpage`` set to ``num``, parses the
        response body as XML, and prints the NAME, FENJI, SHOUJI, EMAIL and
        ZHIWU child texts of every ``EmailResult`` element, padded into columns.

        Args:
            num: Page index; sent as the ``dbpage`` form field after ``str()``.
            timeout: Seconds to wait on the server. requests applies no timeout
                by default, so without it a stalled connection would block the
                calling thread indefinitely.

        Returns:
            None. Output goes to stdout.
        """
        data = {'basename':'BASENAME11',
        'where':'2PLDYDY1',
        'dbpage':str(num),
        'pagecount':'5',
        'order':'ORDER1,ORDER2',
        'orderbytype':'ASC',
        'searchList':'SEARCHLIST11',
        'isKz':'0',
        'id':'0.40519130290516947'}
        # Custom request headers, copied from what the browser sends.
        # The standard header name is spelled "Referer" (single "r" before the
        # final "er"); a "Referrer" key is just an unknown header to the server.
        header1 = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36','Referer':'http://txl.xxx.cn/xxx/center.do?path=txl_index'}
        page = requests.post('http://txl.xxx.cn/xxx/dBSearchForTxlAction.do',headers=header1,data=data,timeout=timeout)
        # Parse the body as XML with BeautifulSoup; for HTML, use 'lxml' instead of 'xml'.
        soup = bp(page.text,'xml')
        # find_all returns a list of every EmailResult element in the response.
        for info in soup.find_all('EmailResult'):
            # Look each field up by tag name and print its text, left-justified into columns.
            print(u'%s'%info.NAME.text.ljust(10,' '),info.FENJI.text.ljust(20,' '),info.SHOUJI.text.ljust(30),info.EMAIL.text.ljust(30),info.ZHIWU.text)
    
    # Build one worker thread per page index 0..74; nothing is started yet.
    ths.extend(threading.Thread(target=get, args=(num,)) for num in range(75))

    # Start every worker before joining any, so the requests run concurrently.
    for worker in ths:
        worker.start()
    for worker in ths:
        worker.join()

    # Report the seconds elapsed since t3 was recorded.
    t4 = time.time()
    tt = t4 - t3
    print(tt)
  • 相关阅读:
    Word Puzzles [POJ 1204]
    set用法
    FOJ有奖月赛2012年11月
    BerDonalds
    POJ1469 匈牙利算法
    后缀数组
    ZOJ Monthly, January 2013
    算法导论<一>
    Yell Classico
    点聚 WebOffice 编辑辅助控件 WebOffice.OCX
  • 原文地址:https://www.cnblogs.com/wt11/p/6933629.html
Copyright © 2011-2022 走看看