zoukankan      html  css  js  c++  java
  • effect request

    from bs4 import BeautifulSoup
    import os
    # Build name_poi_dic: for every saved Baidu Map result page in `filepath`,
    # map the requested name (derived from the file path) to the list of
    # {'name': ..., 'addr': ...} entries scraped from that page.

    # Raw string so the backslashes are path separators, not escape sequences.
    # (A raw string cannot end in a backslash, so the separator between the
    # directory and the file name is added by os.path.join below.)
    filepath = r'D:\pymine\clean\spider_map\baidu_map_html_firstpage_pc_test'
    pathDir = os.listdir(filepath)
    name_poi_dic = {}
    need_todo_request = ['搜索结果']
    no_list = ['全国范围内未找到相关地点', '共找到0个搜索结果']
    bd_no_this_name_str = '百度对此条无结果'
    # Placeholder address for a result title that has no matching address node.
    bd_no_addr_str = '百度此处无地址'
    # Sample page text: no result found, showing search results for "大兴店" instead.
    for allDir in pathDir:
        child = os.path.join(filepath, allDir)
        if '&' in child or '170' in child:
            # These files are deleted, not parsed; skip to the next one instead
            # of trying to open a file that no longer exists.
            os.remove(child)
            continue

        # The key is whatever follows 'baidu_map_html_firstpage_pc' in the path,
        # cut at the first '&' and at '.html', with backslashes removed. With
        # the current directory name this keeps a leading '_test'.
        requested_file = (
            child.split('baidu_map_html_firstpage_pc')[1]
            .split('&')[0]
            .split('.html')[0]
            .replace('\\', '')
        )
        poi_list = []
        name_poi_dic[requested_file] = {'poi_list': poi_list}

        # Parse inside `with` so the handle is always closed; on Windows an
        # open file cannot be removed, and the unrecognised case removes it.
        with open(child, 'r', encoding='utf-8') as mybytes:
            soup = BeautifulSoup(mybytes, "lxml")
        soup_text = soup.text

        recognised = True
        if '全国范围内未找到' in soup_text:
            poi_list.append(bd_no_this_name_str)
        elif '商户免费标注' in soup_text:
            name_l = soup.find_all(class_='n-blue')
            addr_l = soup.find_all(class_='n-grey')
            len_addr = len(addr_l)
            for index_, name_tag in enumerate(name_l):
                # Names and addresses are paired by position; a name without a
                # counterpart gets the placeholder address.
                addr = addr_l[index_].text if index_ < len_addr else bd_no_addr_str
                poi_list.append({'name': name_tag.text, 'addr': addr})
        elif 'm.hao123.com' in soup_text:
            name_l = soup.find_all(class_='text-ellipsis -ft-primary -ft-large')
            addr_l = soup.find_all(class_='dis-inf text-ellipsis -col-auto')
            len_addr = len(addr_l)
            for index_, name_tag in enumerate(name_l):
                # Drop everything up to the first '.' in the title. Splitting
                # once keeps names that contain a dot themselves, and a title
                # without any dot is kept whole instead of raising IndexError.
                name = name_tag.text.split('.', 1)[-1]
                addr = addr_l[index_].text if index_ < len_addr else bd_no_addr_str
                poi_list.append({'name': name, 'addr': addr})
        elif '地址:' in soup_text:
            # find_all() returns a list-like ResultSet with no .text of its own,
            # so look for the first <td> whose text carries the address marker.
            td_text = next(
                (td.text for td in soup.find_all('td') if '地址:' in td.text),
                None,
            )
            if td_text is None:
                recognised = False
            else:
                # partition() splits on the first marker only, so a repeated
                # marker cannot break the two-way unpacking.
                name, _, addr = td_text.partition('地址:')
                poi_list.append({'name': name, 'addr': addr})
        else:
            recognised = False

        if not recognised:
            # Unrecognised page layout: delete the saved file and report it.
            os.remove(child)
            print('TODO', requested_file)
    

      

  • 相关阅读:
    调试某狐木马驱动被杀后系统卡死开机
    情景剧:C/C++中的未定义行为(undefined behavior)
    汇编概念辨析(Intel/AT&T syntax、GAS、NASM)
    软考准考证打印设置(IE1909)
    基于redis的分布式锁 RedissonLock解锁异常解决
    深入理解synchronized
    单例模式
    归并排序
    旧电脑硬盘回收
    萤石、乐橙、3D
  • 原文地址:https://www.cnblogs.com/rsapaper/p/7425285.html
Copyright © 2011-2022 走看看