zoukankan      html  css  js  c++  java
  • 自动化抓取数据

    import time
    from selenium import webdriver
    from pymouse import PyMouse
    import os
    import random
    from shutil import copy


    class tietajuhe(object):
        """Save Baidu news result pages per category and file them into dated folders.

        Instantiating the class runs the whole pipeline: for each category a
        Chrome session is started, the first results of every keyword search are
        opened and saved through mouse clicks (``Html_save``), dated folders are
        created under the category's work directory (``mkdir_and_file``) and the
        downloaded pages are distributed into them (``copy_file``).

        Attributes set by ``__init__``:
            files_path: directory the saved pages are read from.
            m: ``PyMouse`` instance used for the screen clicks.
            option / browser: Chrome options and driver of the current category.
            path: work directory of the current category.
        """

        def __init__(self):
            """Run the pipeline once for every category.

            Each step is best-effort: a failing step is reported on stdout and
            the remaining steps still run.
            """
            # A raw string literal cannot end in a backslash, so the trailing
            # separator is written as an escaped backslash in a normal literal.
            # NOTE(review): there is no separator after the drive letter -- it may
            # have been lost when the script was pasted; confirm the real path.
            self.files_path = 'D:html_dowload\\'
            self.m = PyMouse()
            category = ['乡镇', '市政', '园区', '广告']
            try:
                for Category_name in category:
                    self.option = webdriver.ChromeOptions()
                    self.option.add_argument("--user-data-dir=" + r"D:/Google2data/")
                    # Open a Chrome session that reuses the profile directory above.
                    self.browser = webdriver.Chrome(chrome_options=self.option)
                    # Work directory of this category (same escaping caveat as
                    # files_path; NOTE(review): confirm the intended separators).
                    self.path = "D:铁塔聚合810\\%s全国百度资讯\\" % Category_name
                    try:
                        self.Html_save(Cat=Category_name)
                    except Exception as exc:
                        print("保存网页出问题了", exc)
                    try:
                        self.mkdir_and_file()
                    except Exception as exc:
                        print("创建文件夹出问题了", exc)
                    try:
                        self.copy_file()
                    except Exception as exc:
                        print("转移网页出问题了", exc)
            except Exception as exc:
                # Only the browser start-up above can reach this handler; report
                # it instead of silently discarding the remaining categories.
                print(exc)

        def mkdir_and_file(self):
            """Create nine dated sub-folders in ``self.path``, each with a score file.

            Folder names are ``<month><day><i>`` for ``i`` in 1..9 (local time,
            no zero padding). Every folder receives an empty file named
            ``<n>.score`` where ``n`` is a random integer in [1, 100].

            Existing folders are reused, so the method can be run again on the
            same day, and the process working directory is left untouched.
            """
            n_time = time.localtime()
            today_time = str(n_time.tm_mon) + str(n_time.tm_mday)
            for i in range(1, 10):
                day_dir = os.path.join(self.path, today_time + str(i))
                os.makedirs(day_dir, exist_ok=True)
                num_score = random.randint(1, 100)
                # Only the file's existence matters; it is created empty.
                with open(os.path.join(day_dir, "%d.score" % num_score), 'w'):
                    pass

        def copy_file(self):
            """Move one downloaded page into each top-level folder of ``self.path``.

            Files found anywhere below ``self.files_path`` are paired, in sorted
            order, with the sorted immediate sub-folders of ``self.path``. Each
            paired file is copied into its folder and then deleted from the
            source. Pairing stops when either side runs out, so surplus folders
            stay empty and surplus files stay where they are.
            """
            mubiao_dir = self.path
            # The first tuple yielded by os.walk describes the top level; the
            # default covers a missing directory, for which os.walk yields nothing.
            target_dirs = sorted(next(os.walk(mubiao_dir), (mubiao_dir, [], []))[1])

            html_files = []
            for root, _dirs, files in os.walk(self.files_path):
                for html_name in files:
                    # Join with the directory actually being walked so files in
                    # nested folders resolve to their real location.
                    html_files.append(os.path.join(root, html_name))
            html_files.sort()
            print(len(html_files))

            for dir_name, html_file in zip(target_dirs, html_files):
                copy(html_file, os.path.join(mubiao_dir, dir_name))
                # Remove the page once it has been copied.
                os.remove(html_file)

        def Html_save(self, Cat):
            """Open the first two Baidu news results of every keyword of ``Cat``.

            Args:
                Cat: category name; one of '乡镇', '市政', '园区', '广告'.

            Raises:
                ValueError: if ``Cat`` is not a known category.

            ``self.browser`` is always quit before returning, including when an
            error occurs.
            """
            Cat_Dic = {
                '乡镇': ['视频监控', '智慧乡镇', '乡镇建设', '脱贫攻坚', '民生建设'],
                '市政': ['智能云广播', '智慧人社局', '智慧灯杆', '市政面貌', '智慧监管'],
                '园区': ['智慧园区', '现代化', '招商', '园区规划', '园区监控'],
                '广告': ['媒体融合', '媒体+', '人工智能', '5G', '智能互联'],
            }
            search_url = "https://www.baidu.com/s?ie=utf-8&cl=2&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&tn=news&word=%s&rsv_sug3=5&rsv_sug4=284&rsv_sug1=5&rsv_sug2=0&inputT=1286"
            try:
                if Cat not in Cat_Dic:
                    raise ValueError("unknown category: %r" % (Cat,))
                for key in Cat_Dic[Cat]:
                    self.browser.get(search_url % key)
                    self.browser.maximize_window()
                    # Result title links. NOTE(review): find_elements_by_xpath is
                    # the Selenium 3 spelling; newer releases need
                    # find_elements(By.XPATH, ...) -- confirm the installed version.
                    ret = self.browser.find_elements_by_xpath('//*[contains(@id,"")]/h3/a')
                    print(len(ret))
                    for num_a, link in enumerate(ret, start=1):
                        # Only the first two results are processed.
                        if num_a == 3:
                            print('*' * 99)
                            break
                        url = link.get_attribute("href")
                        if not url:
                            continue
                        # Hand the URL over as a script argument so quotes inside
                        # it cannot break out of the JavaScript string.
                        self.browser.execute_script("window.open(arguments[0])", url)
                        time.sleep(1.5)
                        # Fixed screen coordinates; presumably they operate a
                        # page-saving browser extension -- TODO confirm.
                        self.m.click(1800, 50)
                        time.sleep(1.5)
                        self.m.click(1435, 151)
                        time.sleep(3)
                        self.m.click(20, 293)
            finally:
                self.browser.quit()


    if __name__ == '__main__':
        # Constructing the object already runs the whole pipeline from
        # __init__, so __init__ must not be invoked a second time.
        Titan = tietajuhe()


  • 相关阅读:
    软件工程(2018)第一次作业
    SQA计划
    再冲刺
    第三次冲刺
    第二次冲刺
    小组第一次冲刺
    团队合作初体验
    关于git的认识与想法
    我的第一篇博客
    SQA计划和系统测试规程
  • 原文地址:https://www.cnblogs.com/blog0001/p/13471299.html
Copyright © 2011-2022 走看看