zoukankan      html  css  js  c++  java
  • python 爬取媒体文件(使用chrome代理,启动客户端,有防火墙)

    # -*- coding: utf-8 -*-
    '''
    Convert Chinese addresses to longitude/latitude via the Baidu geocoder API.
    '''
    import time,json
    import urllib.request
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import pandas as pd
    import numpy as np
    
    # Baidu Map API key; sent as the 'ak' query parameter by LoadData.get_uri.
    # NOTE(review): credential is hard-coded in source -- consider loading it
    # from configuration or the environment instead.
    AK ='C2hKkyF9fHbmzESq6dmSArZIzw8wEiS1'
    # Input data; LoadData.main reads the 'ADDRESS' column from this table.
    table = pd.read_csv('./data/test.csv',encoding='utf-8')
    # Output handle that LoadData.main writes the collected results to.
    # NOTE(review): opened at import time and never explicitly closed in the
    # visible code; the file is truncated as soon as this module is loaded.
    outfp = open('./data/result_test.csv','w',encoding='utf-8')
    class LoadData:
        """Geocode addresses with the Baidu geocoder API through a Chrome WebDriver.

        Each address is requested in the browser, the JSON shown in the page's
        ``<pre>`` element is parsed, and ``[addr, lng, lat]`` rows are collected
        in ``self.loc_result``.
        """

        # Default chromedriver location. The path separators were missing in the
        # original literal, which made it a drive-relative path that cannot
        # resolve; a raw string keeps the backslashes intact.
        # NOTE(review): the directory split is inferred -- confirm on the target
        # machine or pass ``driver_path`` explicitly.
        DEFAULT_DRIVER_PATH = r'D:\Program Files (x86)\ChromeDriver\chromedriver.exe'

        def __init__(self, driver_path=DEFAULT_DRIVER_PATH):
            """Start the Chrome WebDriver and prepare the result list.

            Args:
                driver_path: Path to the chromedriver executable.
            """
            print("start")
            self.m_driver = webdriver.Chrome(driver_path)
            self.loc_result = []

        @staticmethod
        def _parse_location(payload):
            """Extract ``(lng, lat)`` from a geocoder JSON response string.

            Args:
                payload: Raw JSON text of the geocoder response.

            Returns:
                A ``(lng, lat)`` tuple, or ``(None, None)`` when the text is not
                valid JSON or carries no ``result.location`` object.
            """
            try:
                data = json.loads(payload)
            except ValueError:
                return None, None

            result = data.get('result') if isinstance(data, dict) else None
            if not isinstance(result, dict):
                # Error responses may omit 'result' or carry a non-dict value.
                return None, None

            location = result.get('location')
            if not isinstance(location, dict):
                return None, None
            return location.get('lng'), location.get('lat')

        def get_uri(self, addr, city=''):
            """Query the geocoder for one address.

            Args:
                addr: Address text to geocode.
                city: Optional city passed as the 'city' query parameter.

            Returns:
                A ``(lng, lat)`` tuple; ``(None, None)`` when the page has no
                ``<pre>`` element or the response holds no location.
            """
            server = 'http://api.map.baidu.com/geocoder/v2/?'
            params = urllib.parse.urlencode(
                {'address': addr, 'city': city, 'ak': AK, 'output': 'json'})
            self.m_driver.get(server + params)
            bs = BeautifulSoup(self.m_driver.page_source, 'lxml')

            pre = bs.pre
            if pre is None:
                # The JSON body was not rendered inside <pre>; nothing to parse.
                return None, None
            return self._parse_location(pre.get_text())

        def get_lng_lat(self, addr):
            """Geocode ``addr`` and append ``[addr, lng, lat]`` to ``self.loc_result``.

            A row is appended even when the lookup fails, so the output keeps
            one entry per input address.
            """
            lng, lat = self.get_uri(addr)
            if lng is None or lat is None:
                print("error")
            self.loc_result.append([addr, lng, lat])

        def main(self):
            """Geocode every address in ``table['ADDRESS']`` and write the results.

            The browser is shut down when processing finishes or fails, so no
            Chrome/chromedriver process is left behind.
            """
            addr_list = table['ADDRESS'].tolist()

            try:
                for addr in addr_list:
                    self.get_lng_lat(addr)

                # NOTE(review): this writes the Python repr of a list of lists,
                # not CSV rows, even though the target file is named *.csv.
                # Kept as-is to avoid changing the output format.
                outfp.write(str(self.loc_result))
                outfp.flush()
            finally:
                self.m_driver.quit()
    
    if __name__ == '__main__':
        # Script entry point: run the geocoding job and report elapsed time.
        # time.clock() was removed in Python 3.8; time.perf_counter() is the
        # replacement for measuring elapsed intervals.
        tStart = time.perf_counter()

        LD = LoadData()
        LD.main()

        tEnd = time.perf_counter()
        print("%s s"%(tEnd - tStart))

    附录:

    chromedriver.exe 与 Chrome 版本映射及下载链接

    https://blog.csdn.net/mmayanshuo/article/details/78962398

  • 相关阅读:
    WPF系列四
    (最近新弄的几个小demo) 之 (模仿百度搜索框)后续更新中,比较实用
    WPF系列二
    使用IHttpAsyncHandler实现服务器推送技术
    JS中的substring和substr函数的区别
    JS图片自动切换
    Builder生成器模式
    Adapter适配器模式
    Singleton单例模式
    Prototype原型模式
  • 原文地址:https://www.cnblogs.com/smuxiaolei/p/10847381.html
Copyright © 2011-2022 走看看