zoukankan      html  css  js  c++  java
  • 收录及出图导出excel表

    # coding=utf-8
    #@auther:Mana_菜小刀
    import requests
    import queue
    import threading
    import xlrd
    import xlwt
    from lxml import etree
    from xlutils.copy import copy
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    
    # Suppress urllib3's InsecureRequestWarning noise for unverified HTTPS.
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    # Desktop Chrome UA so Baidu serves the regular desktop result page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }

    # Create the result workbook once, with a single header row.
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet('收录search')
    lst_name = ['url', '收录/未收录', '图片']
    for col, title in enumerate(lst_name):
        sheet1.write(0, col, title)
    myxls.save('result.xls')
    
    def log(*args, **kwargs):
        """Thin wrapper around print(); a single seam for swapping output later."""
        print(*args, **kwargs)
    
    
    class baiduSpider(threading.Thread):
        """Worker thread: drains Baidu search URLs from a shared queue, checks
        whether each query is indexed (收录) and whether result images appear,
        and appends one row per URL to result.xls.
        """

        # Serializes the read-modify-save cycle on result.xls; without it,
        # concurrent workers corrupt the workbook or silently drop rows.
        _xls_lock = threading.Lock()

        def __init__(self, queue_li, name):
            threading.Thread.__init__(self)
            self._queue = queue_li  # shared queue.Queue of search URLs
            self._name = name       # informational label for this worker

        def run(self):
            # get_nowait() + queue.Empty avoids the empty()/get() race:
            # with several workers, the queue can be drained between the
            # empty() check and get(), leaving a blocking get() stuck forever.
            while True:
                try:
                    url = self._queue.get_nowait()
                except queue.Empty:
                    break
                try:
                    self.get_url(url)
                except Exception as e:
                    # Best-effort: log the failure and continue with the next URL.
                    log(e)

        def get_url(self, url):
            """Fetch one Baidu result page and record its index/image status."""
            requests.adapters.DEFAULT_RETRIES = 5
            r = requests.session()
            r.keep_alive = False
            try:
                s = r.get(url=url, headers=headers)
            finally:
                r.close()  # always release the connection pool
            xpather = etree.HTML(s.text)

            strs = xpather.xpath('//span[@class="nums_text"]//text()')
            imgs = xpather.xpath('//img[@class="c-img c-img6"]/@src')
            search_mo = ['收录', '未收录']
            img_mo = ['有图', '无图']
            url_mo = url.replace('http://www.baidu.com/s?wd=', '')

            # strs can be empty (captcha page / markup change); treat that as
            # "not indexed" instead of raising IndexError on strs[0].
            indexed = bool(strs) and strs[0] != "百度为您找到相关结果约0个"
            search_res = search_mo[0] if indexed else search_mo[1]
            img_res = img_mo[0] if indexed and len(imgs) > 0 else img_mo[1]

            # xlwt cannot append in place, so re-read the sheet, copy, write
            # the next row, and save — all under the lock so rows written by
            # other threads are not lost between open and save.
            with baiduSpider._xls_lock:
                workbook = xlrd.open_workbook('result.xls', formatting_info=True)
                sheet = workbook.sheet_by_index(0)
                rowNum = sheet.nrows
                newbook = copy(workbook)
                newsheet = newbook.get_sheet(0)
                newsheet.write(rowNum, 0, url_mo)
                newsheet.write(rowNum, 1, search_res)
                newsheet.write(rowNum, 2, img_res)
                newbook.save('result.xls')
            log(search_res, '', img_res, '', url_mo)
    
    def main():
        """Read queries from the 'urls' file (one per line), enqueue Baidu
        searches for them, and run a pool of worker threads until done.
        """
        queue_li = queue.Queue()
        threads = []
        thread_count = 10

        # Change 'urls' to your own txt file name if needed.
        with open('urls', 'r', encoding='utf-8', errors="ignore") as f:
            # splitlines() replaces the original split('\n') whose literal was
            # broken across two source lines (a SyntaxError as written), and
            # also handles \r\n endings.
            urls = f.read().splitlines()

        for url in urls:
            if url:  # skip blank lines
                queue_li.put('http://www.baidu.com/s?wd={}'.format(url))

        # Name workers by index; the original passed the loop's last URL,
        # which is undefined (NameError) when the urls file is empty.
        for i in range(thread_count):
            threads.append(baiduSpider(queue_li, 'spider-{}'.format(i)))

        for t in threads:
            t.start()

        for t in threads:
            t.join()
    
    if __name__ == '__main__':
        # Entry point: print the author's banner, then run the crawl.
        log("Mana好伟大!(^-^)V")
        main()
  • 相关阅读:
    Appium+python自动化13-native和webview切换【转载】
    Appium+python自动化12-appium元素定位【转载】
    Appium+python自动化11-adb必知必会的几个指令【转载】
    Appium+python自动化10-AVD 模拟器【转载】
    Appium+python自动化9-SDK Manager【转载】
    Appium+python自动化8-Appium Python API【转载】
    Appium+python自动化7-输入中文【转载】
    Appium+python自动化6-Remote远程控制【转载】
    Appium+python自动化5-Appium Inspector【转载】
    Centos-内核核心组成
  • 原文地址:https://www.cnblogs.com/mana66ccff/p/11184899.html
Copyright © 2011-2022 走看看