zoukankan      html  css  js  c++  java
  • 收录及出图导出excel表

    # coding=utf-8
    #@auther:Mana_菜小刀
    import requests
    import queue
    import threading
    import xlrd
    import xlwt
    from lxml import etree
    from xlutils.copy import copy
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    
    # Suppress urllib3's InsecureRequestWarning noise for unverified HTTPS.
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    # Desktop Chrome UA so Baidu serves the regular desktop result page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }

    # Create the result workbook once, with a single header row.
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet('收录search')
    lst_name = ['url', '收录/未收录', '图片']
    for col, title in enumerate(lst_name):
        sheet1.write(0, col, title)
    myxls.save('result.xls')
    
    def log(*args, **kwargs):
        """Thin wrapper around print(); a single seam for swapping output later."""
        print(*args, **kwargs)
    
    
    class baiduSpider(threading.Thread):
        """Worker thread: drains Baidu search URLs from a shared queue, checks
        whether each query is indexed (收录) and whether result images appear,
        and appends one row per URL to result.xls.
        """

        # Serializes the read-modify-save cycle on result.xls; without it,
        # concurrent workers corrupt the workbook or silently drop rows.
        _xls_lock = threading.Lock()

        def __init__(self, queue_li, name):
            threading.Thread.__init__(self)
            self._queue = queue_li  # shared queue.Queue of search URLs
            self._name = name       # informational label for this worker

        def run(self):
            # get_nowait() + queue.Empty avoids the empty()/get() race:
            # with several workers, the queue can be drained between the
            # empty() check and get(), leaving a blocking get() stuck forever.
            while True:
                try:
                    url = self._queue.get_nowait()
                except queue.Empty:
                    break
                try:
                    self.get_url(url)
                except Exception as e:
                    # Best-effort: log the failure and continue with the next URL.
                    log(e)

        def get_url(self, url):
            """Fetch one Baidu result page and record its index/image status."""
            requests.adapters.DEFAULT_RETRIES = 5
            r = requests.session()
            r.keep_alive = False
            try:
                s = r.get(url=url, headers=headers)
            finally:
                r.close()  # always release the connection pool
            xpather = etree.HTML(s.text)

            strs = xpather.xpath('//span[@class="nums_text"]//text()')
            imgs = xpather.xpath('//img[@class="c-img c-img6"]/@src')
            search_mo = ['收录', '未收录']
            img_mo = ['有图', '无图']
            url_mo = url.replace('http://www.baidu.com/s?wd=', '')

            # strs can be empty (captcha page / markup change); treat that as
            # "not indexed" instead of raising IndexError on strs[0].
            indexed = bool(strs) and strs[0] != "百度为您找到相关结果约0个"
            search_res = search_mo[0] if indexed else search_mo[1]
            img_res = img_mo[0] if indexed and len(imgs) > 0 else img_mo[1]

            # xlwt cannot append in place, so re-read the sheet, copy, write
            # the next row, and save — all under the lock so rows written by
            # other threads are not lost between open and save.
            with baiduSpider._xls_lock:
                workbook = xlrd.open_workbook('result.xls', formatting_info=True)
                sheet = workbook.sheet_by_index(0)
                rowNum = sheet.nrows
                newbook = copy(workbook)
                newsheet = newbook.get_sheet(0)
                newsheet.write(rowNum, 0, url_mo)
                newsheet.write(rowNum, 1, search_res)
                newsheet.write(rowNum, 2, img_res)
                newbook.save('result.xls')
            log(search_res, '', img_res, '', url_mo)
    
    def main():
        """Read queries from the 'urls' file (one per line), enqueue Baidu
        searches for them, and run a pool of worker threads until done.
        """
        queue_li = queue.Queue()
        threads = []
        thread_count = 10

        # Change 'urls' to your own txt file name if needed.
        with open('urls', 'r', encoding='utf-8', errors="ignore") as f:
            # splitlines() replaces the original split('\n') whose literal was
            # broken across two source lines (a SyntaxError as written), and
            # also handles \r\n endings.
            urls = f.read().splitlines()

        for url in urls:
            if url:  # skip blank lines
                queue_li.put('http://www.baidu.com/s?wd={}'.format(url))

        # Name workers by index; the original passed the loop's last URL,
        # which is undefined (NameError) when the urls file is empty.
        for i in range(thread_count):
            threads.append(baiduSpider(queue_li, 'spider-{}'.format(i)))

        for t in threads:
            t.start()

        for t in threads:
            t.join()
    
    if __name__ == '__main__':
        # Entry point: print the author's banner, then run the crawl.
        log("Mana好伟大!(^-^)V")
        main()
  • 相关阅读:
    Appium+python自动化13-native和webview切换【转载】
    Appium+python自动化12-appium元素定位【转载】
    Appium+python自动化11-adb必知必会的几个指令【转载】
    Appium+python自动化10-AVD 模拟器【转载】
    Appium+python自动化9-SDK Manager【转载】
    Appium+python自动化8-Appium Python API【转载】
    Appium+python自动化7-输入中文【转载】
    Appium+python自动化6-Remote远程控制【转载】
    Appium+python自动化5-Appium Inspector【转载】
    Centos-内核核心组成
  • 原文地址:https://www.cnblogs.com/mana66ccff/p/11184899.html
Copyright © 2011-2022 走看看