1、urllib下的模块:request、error、parse、robotparser
import urllib.request
#import re
#from bs4 import BeautifulSoup
#import lxml
BASEURL = 'http://www.xicidaili.com/'
# Build the request for the entry page and attach a desktop Chrome User-Agent.
req = urllib.request.Request(BASEURL)
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
# Open the response as a context manager so the connection is closed as soon
# as the body has been read, instead of being left to garbage collection.
with urllib.request.urlopen(req) as response:
    html = response.read().decode('utf-8')  # HTML of the entry page
# The page text can now be searched with regular expressions, or parsed with
# BeautifulSoup / lxml.
print(html)
2、requests框架:
import requests
from bs4 import BeautifulSoup
import lxml
BASEURL = 'http://www.xicidaili.com'  # Xici home page
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
# requests has no default timeout; without one the call can block forever.
response = requests.get(BASEURL, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')  # parse the page HTML (raw bytes)
links = soup.find_all('a', attrs={'class': 'more'})  # select the wanted <a class="more"> elements
# BeautifulSoup accepts decoded text as well as bytes; response.text is the
# str form of the same body, so this example no longer relies on an `html`
# variable defined by a different snippet.
soup = BeautifulSoup(response.text, 'lxml')
解析得到的HTML的模块有
# Modules that can be used to parse the fetched HTML.
import re
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
selenium自动化测试:
from selenium import webdriver
from selenium.webdriver.common.by import By
# The key-constants class is spelled `Keys` (capital K).
from selenium.webdriver.common.keys import Keys
# `expected_conditions` is a module, so it is imported from its package and aliased.
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
Flask+Redis:维护IP代理池和cookie池
from multiprocessing import Pool:引入进程池
3、scrapy框架(在Linux下安装更方便),需要安装的依赖:wheel、lxml、pyOpenSSL、Twisted、pywin32、scrapy