zoukankan      html  css  js  c++  java
  • 盖得化工最新采集2

    Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门

    https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6EmUbbW&id=56456460486

    需要切换ip,采集过多会被封杀

    # -*- coding: utf-8 -*-
    """
    Created on Tue May 17 16:26:31 2016
    
    @author: Administrator
    """
    
      
    import requests,bs4,csv,time,random,os
      
    #存放所有二级网址
    fileName='combinedFile.csv'
    #存放二级网址目录
     
    site_hubei="http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
    pages_hubei=31
     
    def Get_sites(site,pages):
        list_pages=[]
        for page in range(1,pages+1):
            thePage=site+"-"+"p"+str(page)+".html"
            list_pages.append(thePage)
         
        return list_pages
    
    
    def Get_company_name(elems,i):
        elems_company_name=elems[i].select(".dblue")
        if len(elems_company_name)==0:   #如果找不到元素,则空起
            company_name=""
            return company_name
        company_name=elems_company_name[0].text
        return company_name
    
    def Get_main_product(elems,i):
        elems_main_product=elems[i].select("li")
        if len(elems_main_product)==0:   #如果找不到元素,则空起
            main_product=""
            return main_product
        main_product=elems_main_product[1].text.strip("
    ")
        return main_product
        
    def Get_phone_address(elems,i):
        elems_contact=elems[i].select(".site_l")
        content_contact=elems_contact[0].text
        content_contact1=content_contact.strip("
    
    	
    ")
        content_contact2=content_contact1.strip("
    ")
        list_content_contact=content_contact2.split("
    
    ")
        #有时候信息会缺失,用正则表达式筛选text内容
        if len(list_content_contact)==2:
            phone=list_content_contact[0]
            address=list_content_contact[1]
        if len(list_content_contact)==1:
            content=list_content_contact[0]
            if "地址" in content:
                address=content
                phone=""
            if "电话" in content:
                phone=content
                address=""
        phone_address=(phone,address)
        return phone_address
     
    #获取每一页20个公司信息存储在list_rows_information
    def Get_page_information(elems):
        #每一页20个公司信息存储在list_rows_information里面
        list_rows_information=[]
        num=len(elems)
        for i in range(num):
            try:
                #公司名称
                company_name=Get_company_name(elems,i)
                
                #主要产品
                main_product=Get_main_product(elems,i)
                
                #联系方式
                phone_address=Get_phone_address(elems,i)
                phone=phone_address[0]
                address=phone_address[1]
                
                list_rows_information.append([company_name,main_product,phone,address])
            except:
                print("error at:",i)
                continue
        return list_rows_information
     
     
      
    #把一页内容写入csv文档 ,list_tableContent为二维列表[[a],[b],[c]]
    def Write_table_to_csv(fileName,list_tableContent):
        #对列表格式修改,字符串写入的格式不对
        file=open(fileName,'w',newline='')
        writer1=csv.writer(file)
        writer1.writerows(list_tableContent)
        file.close()
          
    #写入所有文件
    def Write_allTables_to_csvs(list_pages):
        for i in range(pages_hubei):
            try:
                res=requests.get(list_pages[i])
                soup=bs4.BeautifulSoup(res.text,"lxml")
                #综合信息
                elems=soup.select(".clist_list_content_r")
                #获取每一页20个公司信息存储在list_rows_information
                list_rows_information=Get_page_information(elems)
                filename=str(i+1)+".csv"
                Write_table_to_csv(filename,list_rows_information)
                time.sleep(random.randint(10,15))
            except:
                print("error at:",i)
                continue
    #主函数
    #获取32页主要网址       
    list_pages=Get_sites(site_hubei,pages_hubei)
     
    
    #生产所有csv文件        
    Write_allTables_to_csvs(list_pages)
    
    
    '''
    测试
    i=3
    res=requests.get(list_pages[i])
    soup=bs4.BeautifulSoup(res.text,"lxml")
    elems=soup.select(".clist_list_content_r")
    #联系方式
    elems_contact=elems[2].select(".site_l")
    content_contact=elems_contact[0].text
    content_contact1=content_contact.strip("
    
    	
    ")
    content_contact2=content_contact1.strip("
    ")
    list_content_contact=content_contact2.split("
    
    ")
    
    #有时候信息会缺失,用正则表达式筛选text内容
    if len(list_content_contact)==2:
        phone=list_content_contact[0]
        address=list_content_contact[1]
    if len(list_content_contact)==1:
        content=list_content_contact[0]
        if "地址" in content:
            address=content
            phone=[]
        if "电话" in content:
            phone=content
            address=[]
    '''
    

      

  • 相关阅读:
    【bzoj4591】[Shoi2015]超能粒子炮·改 Lucas定理
    【bzoj1604】[Usaco2008 Open]Cow Neighborhoods 奶牛的邻居 旋转坐标系+并查集+Treap/STL-set
    十分钟看懂图像语义分割技术
    命令行执行python模块时提示ImportError: No module named xxx
    python json与字典对象互相转换
    C#中json字符串的序列化和反序列化
    Python当前线程休眠1秒钟
    python之bytes和string
    Win32 基本文件读写操作
    C# 字符串与字节数组相互转换
  • 原文地址:https://www.cnblogs.com/webRobot/p/5502429.html
Copyright © 2011-2022 走看看