zoukankan      html  css  js  c++  java
  • 化工最新采集3——多线程

    sklearn实战-乳腺癌细胞数据挖掘(博主亲自录制视频)

    多线程采集,1秒搞定

    # -*- coding: utf-8 -*-
    """
    Created on Tue May 17 16:26:31 2016
    采集下来excel文件小于2kb的有问题
    
    @author: Administrator
    """
    
      
    import requests,bs4,csv,time,random,os,threading
      
    # Name of the merged output csv (used by the single-threaded path only)
    fileName='combinedFile.csv'
    # URLs whose download failed; collected by the worker threads for retry
    bad_urls=[]
    # Supplier-listing base URLs per province (page suffix "-pN.html" is appended)
    site_hubei="http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
    site_guangdong="http://china.guidechem.com/suppliers/list_catid-21_area-广东"
    site_shanghai="http://china.guidechem.com/suppliers/list_catid-21_area-%E4%B8%8A%E6%B5%B7"
    site_shanxi="http://china.guidechem.com/suppliers/list_catid-21_area-陕西"
    # Number of listing pages per province (counted manually on the site)
    pages_hubei=31
    pages_guangdong=21
    pages_shanghai=34
    pages_shanxi=15
    # First page index for the single-threaded crawl
    start_page=0
     
    def Get_sites(site,pages):
        """Return the paginated listing URLs for *site*, pages 1..*pages*.

        Each listing page follows the pattern "<site>-p<N>.html".
        """
        return [site + "-p" + str(page) + ".html" for page in range(1, pages + 1)]
    
    
    def Get_company_name(elems,i):
        """Return the company name text from result block *i*, or '' if
        the block has no ".dblue" node."""
        matches = elems[i].select(".dblue")
        if matches:
            return matches[0].text
        # No ".dblue" element in this block -> leave the field blank.
        return ""
    
    def Get_main_product(elems,i):
        """Return the main-product text from result block *i*, or '' if absent.

        The product name is the text of the *second* <li> in the block.
        Fixes: the strip argument had been mangled into a literal newline
        inside the quotes (a syntax error) -- restored to "\n"; and a block
        with exactly one <li> used to raise IndexError -- now returns ''.
        """
        items = elems[i].select("li")
        # Need at least two <li> entries; the product sits in the second one.
        if len(items) < 2:
            return ""
        return items[1].text.strip("\n")
        
    def Get_phone_address(elems,i):
        """Return a (phone, address) tuple from result block *i*.

        The ".site_l" node's text carries the phone line and the address
        line separated by a blank line; either may be missing.  Missing
        fields are returned as ''.

        Fixes: the strip/split arguments had been mangled into literal
        newlines inside the quotes (a syntax error) -- restored to escape
        sequences; and when neither branch assigned phone/address the
        function used to raise UnboundLocalError -- both now default to ''.
        """
        content_contact = elems[i].select(".site_l")[0].text
        # Trim surrounding newlines/tabs, then split on the blank line
        # separating the phone line from the address line.
        cleaned = content_contact.strip("\n\t").strip("\n")
        parts = cleaned.split("\n\n")
        phone = ""
        address = ""
        if len(parts) == 2:
            phone = parts[0]
            address = parts[1]
        # Sometimes one field is missing: decide by the text markers.
        if len(parts) == 1:
            content = parts[0]
            if "地址" in content:      # "address" marker
                address = content
                phone = ""
            if "电话" in content:      # "phone" marker
                phone = content
                address = ""
        return (phone, address)
     
    # Scrape one listing page (roughly 20 companies per page).
    def Get_page_information(url):
        """Fetch *url* and return a list of rows, one per company:
        [company_name, main_product, phone, address].

        A malformed company block is skipped (logged) rather than
        aborting the whole page.
        """
        list_rows_information = []
        res = requests.get(url)
        time.sleep(2)      # be polite to the server between request and parse
        soup = bs4.BeautifulSoup(res.text, "lxml")
        time.sleep(2)
        # One ".clist_list_content_r" block per company.
        elems = soup.select(".clist_list_content_r")
        for i in range(len(elems)):
            try:
                company_name = Get_company_name(elems, i)
                main_product = Get_main_product(elems, i)
                phone, address = Get_phone_address(elems, i)
                list_rows_information.append([company_name, main_product, phone, address])
            except Exception:
                # Narrowed from a bare except: still best-effort per block,
                # but no longer swallows SystemExit/KeyboardInterrupt.
                print("error at:", i)
                continue
        return list_rows_information
     
     
      
    # Write one page's rows to a csv file; the rows are a 2-D list [[a],[b],[c]].
    def Write_table_to_csv(url):
        """Scrape *url* and write its rows to a per-page csv file.

        The file name is the last three characters of the url stem plus
        ".csv", e.g. ".../list-p12.html" -> "p12.csv".

        Fix: the file handle used to leak if writerows raised; a "with"
        block now guarantees it is closed.
        """
        list_tableContent = Get_page_information(url)
        fileName = os.path.splitext(url)[0][-3:] + ".csv"   # e.g. "p12.csv"
        with open(fileName, 'w', newline='') as file:
            csv.writer(file).writerows(list_tableContent)
          
    # Single-threaded fallback: scrape every page url in sequence.
    def Write_allTables_to_csvs(list_pages):
        """Scrape every url in *list_pages* starting at start_page, one
        page at a time, with a long pause between pages.

        Fixes: the original passed the loop index (an int) to
        Write_table_to_csv instead of the url, so every call failed and
        was swallowed; it also iterated to the hard-coded pages_shanghai
        rather than the length of the list it was given.
        """
        for i in range(start_page, len(list_pages)):
            try:
                Write_table_to_csv(list_pages[i])
                # Long random pause between pages to avoid being blocked.
                time.sleep(random.randint(30, 31))
            except Exception:
                print("error at:", i)
                continue
            
    # Compute the slice size handed to each worker thread.
    def Step(urls_list):
        """Return roughly 1/15th of the url list's length, rounded to
        the nearest integer (the per-thread chunk size)."""
        return int(round(len(urls_list) / 15.0, 0))
            
    # Worker: scrape the slice list_pages[start:end] of the module-level url list.
    def download_range(start,end):
        """Download every url in list_pages[start:end]; a url that fails
        is recorded in the module-level bad_urls list for later retry.

        Fix: the bare except is narrowed to Exception so Ctrl-C and
        SystemExit still propagate out of the worker.
        """
        for url in list_pages[start:end]:
            try:
                Write_table_to_csv(url)
            except Exception:
                bad_urls.append(url)
                continue
            
    # Main driver: build the url list and fan the work out over threads.
    list_pages=Get_sites(site_shanxi,pages_shanxi)
    # Guard against Step() returning 0 for short lists (< ~8 urls), which
    # would make range(0, n, 0) raise ValueError.
    step=max(Step(list_pages),1)
    # Single-threaded alternative (much slower):
    #Write_allTables_to_csvs(list_pages)


    downloadThreads = []  # keep every Thread object so they can all be joined
    for i in range(0, len(list_pages), step):
        # Each worker handles the slice [i, i+step); slicing past the end
        # of the list is safe.
        downloadThread = threading.Thread(target=download_range, args=(i, i +step))
        downloadThreads.append(downloadThread)
        downloadThread.start()

    # Wait for all threads to end.
    for downloadThread in downloadThreads:
        downloadThread.join()
    print('Done.')
    
    
    '''
    测试
    
    #downloadThread = threading.Thread(target=download_range, args=(10, 12))
    #downloadThread.start()
    
    downloadThread = threading.Thread(target=download_range, args=(12, 14))
    downloadThread.start()
    
    
    downloadThread = threading.Thread(target=download_range, args=(14, 16))
    downloadThread.start()
    i=3
    res=requests.get(list_pages[i])
    soup=bs4.BeautifulSoup(res.text,"lxml")
    elems=soup.select(".clist_list_content_r")
    #联系方式
    elems_contact=elems[2].select(".site_l")
    content_contact=elems_contact[0].text
    content_contact1=content_contact.strip("
    
    	
    ")
    content_contact2=content_contact1.strip("
    ")
    list_content_contact=content_contact2.split("
    
    ")
    
    #有时候信息会缺失,用正则表达式筛选text内容
    if len(list_content_contact)==2:
        phone=list_content_contact[0]
        address=list_content_contact[1]
    if len(list_content_contact)==1:
        content=list_content_contact[0]
        if "地址" in content:
            address=content
            phone=[]
        if "电话" in content:
            phone=content
            address=[]
    '''
    

     

     

  • 相关阅读:
    poj 3068 Bridge Across Islands
    XidianOJ 1086 Flappy v8
    XidianOJ 1036 分配宝藏
    XidianOJ 1090 爬树的V8
    XidianOJ 1088 AK后的V8
    XidianOJ 1062 Black King Bar
    XidianOJ 1091 看Dota视频的V8
    XidianOJ 1098 突击数论前的xry111
    XidianOJ 1019 自然数的秘密
    XidianOJ 1109 Too Naive
  • 原文地址:https://www.cnblogs.com/webRobot/p/5505125.html
Copyright © 2011-2022 走看看