zoukankan      html  css  js  c++  java
  • 需要模拟点击爬虫小案例

    用了一天时间,写了一个简单功能的小爬虫,用的selenium
    爬虫主要涉及:
    1、模拟用户登录
    2、模拟点击下拉菜单
    3、定位下拉菜单中的文本、点击选择
    4、双击文本,实现隐藏文本==>显示文本
    5、查询后先切换到首页
    6、点击下一页翻页,判断当前页是否是最后一页
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    import re
    import time
    import os
    import random

    '''
    爬虫函数,输入省份、城市、参数
    该函数主要完成省份、城市参数传入
    driver.execute_script("arguments[0].scrollIntoView();", lis_p[p])完成下拉菜单找到省份及城市所在位置
    点击省份、城市下拉菜单,点击查询,等待加载
    先跳转到首页,然后按页循环获取table中的文本内容
    '''
    def paqu(driver,p,c,lis_p,lis_c,p_name):
    driver.find_element_by_xpath(
    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[11]/div/div[2]/div[1]/span/span/i').click()

    time.sleep(5)

    driver.execute_script("arguments[0].scrollIntoView();", lis_p[p])
    lis_p[p].click()
    time.sleep(5)

    # 城市
    driver.find_element_by_xpath(
    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[12]/div/div[2]/div[1]/span/span/i').click()
    time.sleep(3)
    driver.execute_script("arguments[0].scrollIntoView();", lis_c[c])
    c_name = lis_c[c].text
    lis_c[c].click()
    time.sleep(3)

    # 查询
    driver.find_element_by_xpath('//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[20]/div/button/span').click()
    time.sleep(5)
    try:
    # 跳到首页
    driver.find_element_by_xpath(
    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/span[1]').click()
    time.sleep(5)
    # 获取一共有多少页
    ul_page = driver.find_element_by_xpath('//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/div[2]/ul')
    li_page = ul_page.find_elements_by_xpath('li')
    pages = int(li_page[len(li_page)-1].text)
    for i in range(1,pages+1):
    print(i)
    # 客户名称
    try:
    action_chains = ActionChains(driver)

    v_num = driver.find_elements_by_xpath(
    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[1]/div/div[3]/table/tbody/tr/td[3]/div/span')
    list_num = []
    for n in range(len(v_num)):
    list_num.append(v_num[n].text)

    spans = driver.find_elements_by_xpath(
    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[1]/div/div[3]/table/tbody/tr/td[2]/div/span')

    for i in range(len(spans)):
    action_chains.double_click(spans[i]).perform()
    new_time = random.randint(2,5)
    time.sleep(new_time)
    print(p_name+' '+c_name+' '+spans[i].text+' '+list_num[i])

    f.write(p_name+' '+c_name+' '+spans[i].text+' '+list_num[i])
    f.write(' ')

    but = driver.find_element_by_xpath('//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/div[2]/button[2]')
    flag = but.is_enabled()
    print(flag)
    if flag is True:
    but.click()
    time.sleep(5)
    else:
    print('当前是最后一页')
    except:
    print('查询结果为空')
    driver.find_element_by_xpath(
    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/span[1]').click()
    time.sleep(5)
    except:
    pass

    '''
    点击下拉菜单,获取省份下有多少城市
    '''
    def get_citynum(driver,p):
    driver.find_element_by_xpath(
    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[11]/div/div[2]/div[1]/span/span/i').click()
    time.sleep(5)
    ul_p = driver.find_element_by_xpath(
    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[11]/div/div[2]/div[2]/div[1]/div[1]/ul')
    lis_p = ul_p.find_elements_by_xpath('li')
    driver.execute_script("arguments[0].scrollIntoView();", lis_p[p])
    p_name = lis_p[p].text
    lis_p[p].click()
    time.sleep(5)

    # 城市
    driver.find_element_by_xpath(
    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[12]/div/div[2]/div[1]/span/span/i').click()
    time.sleep(5)
    ul_c = driver.find_element_by_xpath(
    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[12]/div/div[2]/div[2]/div[1]/div[1]/ul')
    lis_c = ul_c.find_elements_by_xpath('li')
    num = len(lis_c)
    return num,lis_p,lis_c,p_name


    if __name__ == "__main__":
    # 打开Firefox浏览器 设定等待加载时间
    driver = webdriver.Chrome()
    # 定位节点
    url = '******'
    driver.get(url)
    time.sleep(5)
    # 输入用户名
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[1]/div/label/div/input').clear()
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[1]/div/label/div/input').send_keys('******')
    # 输入登录的密码
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[2]/div/label/div/input').clear()
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[2]/div/label/div/input').send_keys('******')
    # 输入验证码
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[3]/div/div/label/div/input').send_keys(
    input("输入验证码: "))
    # 点击登录
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/label[1]/button').click()
    time.sleep(5)
    # 客户分析
    driver.find_element_by_xpath('//*[@id="app"]/div/header/nav/div[8]').click()
    time.sleep(5)
    # 潜在车辆销售分析
    driver.find_element_by_xpath('//*[@id="app"]/div/section/div/section[1]/div/div[2]/div/ul/li[5]').click()
    time.sleep(5)
    for p in range(1,2):
    f = open(r'E:MyPythonProjectxiaoshou_kehuVehicle_distribution'+str(p+1)+'.txt', 'w')
    print('开始爬取第'+str(p+1)+'个省份')
    num,lis_p,lis_c,p_name = get_citynum(driver,p)
    for c in range(4,num):
    paqu(driver, p,c,lis_p,lis_c,p_name)
    print('结束爬取第'+str(p+1)+'个省份')
    f.close()

    自己写的爬虫太简单,需要进一步精进修改:没有深入地去
    思考爬虫设计得够不够完善,用户代理、访问频次等等都没有仔细考虑。
    先记录一下本次爬虫小case,有时间的话会进行完善
  • 相关阅读:
    51nod1278 相离的圆
    CodeForces
    SPOJ
    51nod 1040(欧拉函数)
    51nod1009 51nod1042(数位dp)
    51nod1264 线段相交
    51nod1050 循环数组最大子段和
    Spark SQL UDF示例
    Spark SQL官网阅读笔记
    Spark RDD
  • 原文地址:https://www.cnblogs.com/liuffblog/p/13152506.html
Copyright © 2011-2022 走看看