zoukankan      html  css  js  c++  java
  • effect request

    from bs4 import BeautifulSoup
    import os
    # Build name_poi_dic: for every saved search-result page in `filepath`,
    # map the requested name (derived from the file path) to
    # {'poi_list': [...]}, where each entry is either a {'name', 'addr'} dict
    # or the "no result" marker string.

    # Backslashes are doubled: a single trailing backslash would escape the
    # closing quote, and sequences such as "\b" would be read as escapes.
    filepath = 'D:\\pymine\\clean\\spider_map\\baidu_map_html_firstpage_pc_test\\'
    pathDir = os.listdir(filepath)
    name_poi_dic = {}
    need_todo_request = ['搜索结果']
    no_list = ['全国范围内未找到相关地点', '共找到0个搜索结果']
    bd_no_this_name_str = '百度对此条无结果'
    # Placeholder address used when a name has no matching address element.
    no_addr_str = '百度此处无地址'
    # Example of another page text: no result found, search results for
    # "大兴店" are offered instead.


    def _strip_ordinal(text):
        """Return the part of *text* after its first '.', or *text* if there is none.

        NOTE(review): presumably the titles look like "1.Name"; confirm
        against a saved page.
        """
        _, dot, rest = text.partition('.')
        return rest if dot else text


    def _pair_names_with_addresses(name_tags, addr_tags, clean_name=None):
        """Pair each name tag with the address tag at the same position.

        Names beyond the last available address get the placeholder address.
        *clean_name*, when given, is applied to each name text.
        """
        poi_list = []
        addr_count = len(addr_tags)
        for index_, name_tag in enumerate(name_tags):
            name = name_tag.text
            if clean_name is not None:
                name = clean_name(name)
            addr = addr_tags[index_].text if index_ < addr_count else no_addr_str
            poi_list.append({'name': name, 'addr': addr})
        return poi_list


    def _parse_poi_list(soup):
        """Return the POI entries found in *soup*, or None for an unknown layout."""
        soup_text = soup.text

        if '全国范围内未找到' in soup_text:
            return [bd_no_this_name_str]

        if '商户免费标注' in soup_text:
            return _pair_names_with_addresses(
                soup.find_all(class_='n-blue'),
                soup.find_all(class_='n-grey'))

        if 'm.hao123.com' in soup_text:
            return _pair_names_with_addresses(
                soup.find_all(class_='text-ellipsis -ft-primary -ft-large'),
                soup.find_all(class_='dis-inf text-ellipsis -col-auto'),
                clean_name=_strip_ordinal)

        if '地址:' in soup_text:
            # find_all() returns a list of tags, which has no .text of its
            # own: use the first <td> holding the marker, falling back to the
            # whole page text (which is known to contain it).
            cell_text = next(
                (td.text for td in soup.find_all('td') if '地址:' in td.text),
                soup_text)
            # partition() splits on the first marker only, so a repeated
            # marker cannot break the two-way unpacking.
            name, _, addr = cell_text.partition('地址:')
            return [{'name': name, 'addr': addr}]

        return None


    for allDir in pathDir:
        # filepath already ends with a separator, so plain concatenation
        # yields the full path.
        child = filepath + allDir

        if '&' in child or '170' in child:
            os.remove(child)
            # The file is gone; skip it instead of trying to open it.
            continue

        requested_file = (child.split('baidu_map_html_firstpage_pc')[1]
                          .split('&')[0]
                          .split('.html')[0]
                          .replace('\\', ''))

        # Parse inside the with-block so the handle is always closed, and is
        # closed before the file may be removed below.
        with open(child, 'r', encoding='utf-8') as html_file:
            soup = BeautifulSoup(html_file, "lxml")

        poi_list = _parse_poi_list(soup)
        name_poi_dic[requested_file] = {'poi_list': poi_list or []}

        if poi_list is None:
            # Unrecognised page: delete it and report the name so it can be
            # requested again.
            os.remove(child)
            print('TODO', requested_file)
    

      

  • 相关阅读:
    PHP中关于字符串的连接
    好用的FireFox(FF)插件
    Scripted Operation
    Scripted device
    chgrp chown
    wait_for_devices
    mysql create user
    mysql
    create user mysql
    Inserting/Removing shutters and filters
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7425285.html
Copyright © 2011-2022 走看看