  • HC360 (慧聪网) crawler
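    A crawler for company contact details on HC360 (b2b.hc360.com). It reads shop URLs from uu.txt, dispatches each page to one of four template-specific parsers (yellow-pages "公司黄页", mobile speed edition "手机极速版", uncertified "未认证", and the default certified layout), extracts the company name, contact person, address, and phone number, and writes the results to an Excel file with pandas.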

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import time
    import re
    import random
    
    # pool of desktop browser User-Agent strings to rotate through
    UA_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; GWX:MANAGED)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; GWX:MANAGED)',
    ]
    
    # requests expects the proxies mapping to be keyed by URL scheme
    proxies_list = [{'http': 'http://10.220.70.254:808'},
                    {'http': 'http://10.221.70.254:808'},
                    {'http': 'http://10.222.70.254:808'},
                    {'http': 'http://10.223.70.254:808'}]
    
    # a User-Agent is picked once at import time, so every request reuses the same one
    headers = {'User-Agent': random.choice(UA_list), 'Referer': 'http://b2b.hc360.com/'}
    
    def diyu(sheng, shi):
        """Yield shop URLs from the HC360 enterprise search for a given
        province (sheng) and city (shi), walking up to 100 result pages.
        The z parameter is 中国:<province>省:<city>, GBK percent-encoded."""
        for i in range(100):
            or_url = 'http://s.hc360.com/?w={}&mc=enterprise&ee={}&z=%D6%D0%B9%FA%3A{}%CA%A1%3A{}'.format(sheng, i + 1, sheng, shi)
            res = requests.get(or_url, headers=headers)
            soup = BeautifulSoup(res.text, 'lxml')
            urls = soup.select('dd.til > h3 > a')
            for url in urls:
                yield url.get('href')
    
    def url_parser(urld):
        """Fetch a shop URL and dispatch it to the parser matching its page template."""
        res = requests.get(urld, headers=headers, proxies=random.choice(proxies_list), timeout=60)
        if res.status_code != 404:
            soup = BeautifulSoup(res.text, 'lxml')
            # "公司黄页" marks the company yellow-pages template
            flag = re.findall(r'公司黄页', str(soup))
            if len(flag) > 0:
                return url_HYparer(soup)
            else:
                # otherwise fetch the shop's company-profile page and sniff its template
                or_url = urld + 'shop/company.html'
                res = requests.get(or_url, headers=headers, proxies=random.choice(proxies_list), timeout=60)
                soup1 = BeautifulSoup(res.text, 'lxml')
                flag1 = re.findall(r'手机极速版', str(soup1))  # mobile "speed edition" template
                flag2 = re.findall(r'未认证 ', str(soup1))     # uncertified shop
                if len(flag1) > 0:
                    return url_SJJSparer(soup1)
                elif len(flag2) > 0:
                    return url_uncertifie(soup1)
                else:
                    return url_NSJJSparer(soup1)
    
    def url_NSJJSparer(soup):
        """Parse the default certified-shop template."""
        data = {
            'company_name': soup.select('td.contitlebg > span')[0].text.strip(),
            'name': soup.select('span.bluezzbold.font14')[0].text.strip(),
            'address': soup.select('td.conbg.conbg2 > ul:nth-of-type(1) > li:nth-of-type(2)')[0].get('title'),
            # 11-digit mobile number or a 4-digit area code plus 8-digit landline
            'phone': re.search(r'\d{11}|\d{4}-\d{8}', str(soup)).group()}
        return data
    
    def url_HYparer(soup):
        """Parse the "公司黄页" (company yellow pages) template."""
        data = {
            'company_name': soup.select('div.sub-info > h1')[0].text,
            'name': soup.select('samp')[0].text,
            'address': soup.select('div.tableCon > div:nth-of-type(2) > ul > li:nth-of-type(3) > span.conRight')[0].text,
            'phone': soup.select('div.tableCon > div:nth-of-type(2) > ul > li:nth-of-type(2) > span.conRight')[0].text
        }
        return data
    
    def url_SJJSparer(soup):
        """Parse the "手机极速版" (mobile speed edition) template."""
        data = {
            'company_name': soup.select('div.ContacCon1 > h3')[0].text.strip(),
            'name': soup.select('div.ContactsName > span > a')[0].text.strip(),
            'address': soup.select('div.ContacCon3 > ul > li:nth-of-type(1) > div.con3Rig')[0].text.strip(),
            'phone': re.search(r'\d{11}|\d{4}-\d{8}', str(soup)).group()}
        return data
    
    def url_uncertifie(soup):
        """Parse the template used by uncertified ("未认证") shops."""
        data = {
            'company_name': soup.select('td.contitlebg_1 > span')[0].text.strip(),
            'name': soup.select('span.bluezzbold.font14')[0].text.strip(),
            'address': soup.select('td.conbg.conbg2 > ul:nth-of-type(1) > li:nth-of-type(2)')[0].text.strip(),
            'phone': re.search(r'\d{11}|\d{4}-\d{8}', str(soup)).group()}
        return data
    
    if __name__ == '__main__':
        # uu.txt is expected to hold one shop URL per line
        with open('uu.txt', 'r') as f:
            info_total = []
            for i in f:
                try:
                    info_ary = url_parser(i.strip())
                    time.sleep(random.randint(1, 5))  # random delay to avoid hammering the site
                    if info_ary:  # url_parser returns None on a 404
                        info_total.append(info_ary)
                    print(len(info_total))
                except Exception as e:
                    print(e, i.strip())
            df = pd.DataFrame(info_total)
            df.to_excel('huicong_beijing.xlsx')
            print('Done')
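
    Note that diyu() is defined but never called here; presumably it was run beforehand to generate uu.txt. A minimal sketch of that step, assuming Beijing as the target region and the GBK percent-encoding the search URL expects (illustrative only, not part of the original script):

        # hypothetical one-off script to build uu.txt; the region values are assumptions
        from urllib.parse import quote

        sheng = quote('北京', encoding='gbk')  # province name, GBK percent-encoded
        shi = quote('北京', encoding='gbk')    # city name
        with open('uu.txt', 'w') as f:
            for href in diyu(sheng, shi):
                f.write(href + '\n')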
  • Original post: https://www.cnblogs.com/Erick-L/p/6945009.html