zoukankan html css js c++ java

selenium&PhantomJS&BeautifulSoup练习经典实例

# coding = utf-8
# NOTE(review): `__autor__` looks like a misspelling of the conventional
# `__author__` module attribute; the name is left unchanged here in case
# anything reads it - confirm before renaming.
__autor__ = 'litao'

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re, time, random
import selenium.common.exceptions
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
from pymongo import MongoClient

# Module-level browser setup: everything below runs at import time and the
# resulting `brower` / `wait` objects are shared by the functions in this file.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.103 Safari/537.36"
)  # Override the User-Agent header sent by the headless browser
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']  # Run PhantomJS without loading images and with the disk cache enabled
brower = webdriver.PhantomJS(service_args=SERVICE_ARGS, desired_capabilities=dcap)
brower.set_window_size(1920, 1080)  # Set an explicit resolution so elements can be found when performing click actions
wait = WebDriverWait(brower, 10)  # Shared explicit wait; 10 is the maximum wait time
brower.get(url="https://www.taobao.com")


def search(retry_times):
    """Search Taobao for the keyword "美食" and parse the first result page.

    Uses the module-level ``wait`` to locate the search box and button, submits
    the keyword, waits for the ``.total`` pager element, and hands the first
    result page to ``pase_page``.

    Args:
        retry_times: How many additional attempts to make when a wait times
            out. Each timeout consumes one retry.

    Returns:
        The text of the ``.total`` element (which ``main`` parses for the page
        count), or ``None`` when every attempt timed out.
    """
    try:
        input_content = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
        search_botton = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".btn-search")))
        # Clear first: on a retry the box may still hold the keyword typed by
        # the previous attempt, and send_keys appends to existing content.
        input_content.clear()
        input_content.send_keys("美食")
        search_botton.click()
        totle = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".total")))
        # Read the text immediately; pase_page interacts with the page and the
        # element reference may no longer be valid afterwards.
        total_text = totle.text
        print("0k")
        print("1")
        pase_page(1)
        return total_text
    except selenium.common.exceptions.TimeoutException as e:
        print(e)
        if retry_times > 0:
            # Retry the whole search after a wait timeout.
            return search(retry_times - 1)
        return None


def next_page(page_number, retry_times):
    """Jump to result page ``page_number`` via the pager form and parse it.

    Types the page number into the pager input, submits it, waits until the
    active pager item shows that number, then calls ``pase_page``.

    Args:
        page_number: The result page to navigate to.
        retry_times: How many additional attempts to make after a wait
            timeout.

    Returns:
        Always ``None``; when all attempts time out the page is skipped.
    """
    attempts_left = retry_times
    while True:
        try:
            page_box_locator = (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")
            submit_locator = (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
            active_locator = (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span')

            page_box = wait.until(EC.presence_of_element_located(page_box_locator))
            submit_button = wait.until(EC.presence_of_element_located(submit_locator))
            page_box.clear()
            page_box.send_keys(page_number)
            submit_button.click()
            # The jump is complete once the highlighted pager item shows the
            # requested page number.
            wait.until(EC.text_to_be_present_in_element(active_locator, str(page_number)))
            print(str(page_number))
            pase_page(page_number)
            return None
        except selenium.common.exceptions.TimeoutException as e:
            print(e)
            if attempts_left <= 0:
                return None
            # Retry the same page after a wait timeout.
            attempts_left -= 1


def _node_text(item, css_class):
    """Return the stripped text of the first descendant with ``css_class``, or '' if absent."""
    node = item.find(class_=css_class)
    return node.text.strip() if node is not None else ''


def _extract_product(item):
    """Build the product record (image, price, deal, title, shop, location) for one item card."""
    img = item.find('img')
    return {
        "image": img.get('data-src') if img is not None else None,
        "price": _node_text(item, "price"),
        # Drop the last three characters of the deal text (a trailing suffix
        # after the number).
        "deal": _node_text(item, "deal-cnt")[:-3],
        # Remove all whitespace inside the remaining text fields.
        "title": re.sub(r'\s', '', _node_text(item, "title")),
        "shop": re.sub(r'\s', '', _node_text(item, "shop")),
        "location": re.sub(r'\s', '', _node_text(item, "location")),
    }


def pase_page(page_number):
    """Parse the result page currently loaded in ``brower`` and store its products.

    For page 1 the function first waits for the first entry of the
    personalised item list, moves the pointer onto it, and waits for
    ``#J_itemlistCont`` before reading the page source. Every matched item card
    is converted to a dict, printed, and passed to ``save_to_mongodb``.

    Args:
        page_number: Number of the result page being parsed; only the value 1
            triggers the extra wait/hover step.

    Raises:
        selenium.common.exceptions.TimeoutException: If one of the page-1
            waits times out (callers catch this to retry).
    """
    if page_number == 1:
        first_item = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#J_itemlistPersonality > div:nth-child(1) > div:nth-child(1)")))
        ActionChains(brower).move_to_element(first_item).perform()
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#J_itemlistCont")))
    html = brower.page_source
    # Collapse the two-class prefix into a single token so the cards can be
    # matched by one class name below. NOTE(review): cards that carry further
    # classes after this prefix end up with a fused token and are not matched -
    # presumably intentional, confirm.
    html = html.replace("item J_MouserOnverReq ", "item_J_MouserOnverReq")
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find_all('div', attrs={"class": "item_J_MouserOnverReq"})
    print(len(content))
    for item in content:
        result = _extract_product(item)
        print(result)
        save_to_mongodb(result)
    print("**********************************************************************************************")


def save_to_mongodb(product):
    """Insert one product record into the ``taobao.taobao_meishi`` collection.

    The MongoDB client for 127.0.0.1:27017 is created on the first call and
    cached on the function object, so later calls reuse one client instead of
    opening a new one per product.

    Args:
        product: Dict describing a single product. The driver adds an ``_id``
            key to it on insert.
    """
    client = getattr(save_to_mongodb, "_client", None)
    if client is None:
        client = MongoClient('127.0.0.1', 27017)
        save_to_mongodb._client = client
    db = client.taobao
    # insert_one replaces the deprecated Collection.insert, which newer
    # PyMongo releases no longer provide.
    db["taobao_meishi"].insert_one(product)


def main():
    """Run the crawl: search, read the total page count, then visit every page.

    The first result page is parsed inside ``search``; pages 2..N are fetched
    with ``next_page`` after a random 1-3 second pause each. Any exception is
    reported on stdout, and the browser is always shut down on exit.
    """
    try:
        result = search(2)
        if result:
            # The pager text contains the total page count; take the first run
            # of digits.
            match = re.search(r'\d+', result)
            if match is None:
                raise ValueError("no page count found in pager text: %r" % (result,))
            count_page = int(match.group(0))
            for i in range(2, count_page + 1):
                # Random pause between page requests.
                time.sleep(random.randint(1, 3))
                print("-----", i)
                next_page(i, 2)

    except Exception as e:
        print("程序运行过程中出现错误，具体错误如下所示：" + '\n', e)
    finally:
        # try/except/finally guarantees the browser is shut down before exit
        # whatever caused the failure. quit() ends the whole driver session,
        # whereas close() only closes the current window.
        brower.quit()

# Start the crawl when this file is executed directly as a script.
if __name__ == "__main__":
    main()

查看全文

相关阅读:
【学习】jquery.placeholder.js让IE浏览器支持html5的placeholder
【特效】体验很好的导航hover效果移出恢复当前位置
 【学习】滚动延迟加载插件scrollLoading用法
 【转载】jQuery全屏滚动插件fullPage.js
【原创】自用css reset
【转载】jQuery手机移动端触屏日历日期选择
 【学习】条码扫描器：QuaggaJS
【学习】苹果iPhone safari浏览器样式重置修复按钮圆角bug
将SeqReader打包成可执行的jar包
 Java向上转型和向下转型

原文地址：https://www.cnblogs.com/crawer-1/p/7636163.html