zoukankan html css js c++ java

5.3 Python解析动态页面源代码

首先对 selenium 做一些设置，让它以无界面（headless）方式运行、不弹出浏览器窗口，然后令 etree = html.etree

import datetime
import json
import threading
import traceback
import time
from shlex import join

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import Chrome
import pymysql
import requests
from concurrent.futures import ThreadPoolExecutor
from lxml import html

import re
# Configure a Chrome session that runs without opening a visible window,
# start the driver, and load the target page.
option = ChromeOptions()
# Exclude the "enable-automation" switch from the launch command line.
option.add_experimental_option("excludeSwitches", ["enable-automation"])
# No visible browser UI. On Linux hosts without a display, startup fails
# unless this flag is present.
option.add_argument("--headless")
# Per Google's documentation this flag is needed to work around a bug.
option.add_argument("--disable-gpu")
option.add_argument('window-size=1920x3000')  # browser resolution
option.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
option.add_argument('blink-settings=imagesEnabled=false')  # skip images to load faster
# Explicit location of the browser binary. The path separators had been lost
# from this literal, which left a path that cannot exist on disk.
# NOTE(review): machine-specific path -- adjust to the local Chrome install.
option.binary_location = r"C:\Users\Lenovo\AppData\Local\Google\Chrome\Application\chrome.exe"


web = Chrome(options=option)  # start the browser with the options above

etree = html.etree
web.get("https://trp.autonavi.com/diagnosis/rank.do")

之后用 re 对获取到的页面源代码进行解析（因为获取到的源代码首尾还包裹着一层 <html>…<pre> 之类的额外标签，需要把中间的正文提取出来）

# Source of the page currently loaded in the driver.
js = web.page_source
# The pattern expects the document to be a bare <html>/<body>/<pre> shell and
# lazily captures everything inside the <pre> element as the named group
# "ul". re.S lets "." match newlines so a multi-line payload is captured.
# NOTE(review): presumably this shell is what the browser wraps around a
# plain-text/JSON response -- confirm against the actual page source.
obj1 = re.compile(
    r'<html><head></head><body>'
    r'<pre style="word-wrap: break-word; white-space: pre-wrap;">'
    r'(?P<ul>.*?)'
    r'</pre></body></html>',
    flags=re.S,
)
# Iterator over the match objects; read the payload with .group("ul").
result1 = obj1.finditer(js)

查看全文

相关阅读:
elasticsearch安装ik分词器
 原来你是这样的JAVA[03]-继承、多态、抽象类
 JAVA入门[23]-SpringBoot配置Swagger2
原来你是这样的JAVA[01]-基础一瞥
 springboot + @scheduled 多任务并发
 chrome解决http自动跳转https问题
 jquery.uploadify+spring mvc实现上传图片
 JAVA POI导出excel
使用ztree展示树形菜单结构
 shiro入门示例

原文地址：https://www.cnblogs.com/zhaoyids/p/14905312.html