1、mysql数据库用于存储大量数据。
2、Navicat for MySQL是以图形和表格等形式管理数据库的工具。
3、编程语言python3与环境配置
4、PyCharm集成开发环境(社区版)不需要激活
5、Python包管理器Anaconda3(爬虫主要用到两个包requests,pymysql)与环境配置(网上可找安装教程).
链接:https://pan.baidu.com/s/1Zef6oPmtNZ4sWBXyAMBSgA
提取码:am9q
应用:
1、正则表达式提取猫眼top100电影中的电影名称、主演和上映时间
import re

import requests


def get_text(url):
    """Fetch *url* and return its decoded HTML text.

    Raises requests.HTTPError on a non-2xx status code.
    """
    r = requests.get(url)
    r.raise_for_status()
    # Prefer the encoding sniffed from the body over the header default,
    # so Chinese text decodes correctly.
    r.encoding = r.apparent_encoding
    return r.text


# Matches (title, stars, release-date) on a Maoyan board page.
# NOTE(review): the pasted notes had lost the regex backslashes
# ("s*?", "d{4}-d{2}-d{2}"); they are restored here as \s and \d.
MOVIE_PATTERN = re.compile(
    r'class="name".*?title="(.*?)".*?:(.*?)\s*?</p>.*?:(\d{4}-\d{2}-\d{2})',
    re.S,
)


def parse_html(url, results):
    """Append every (title, stars, release_date) tuple found at *url*.

    *results* is mutated in place and also returned for convenience.
    """
    demo = get_text(url)
    results.extend(re.findall(MOVIE_PATTERN, demo))
    return results


if __name__ == "__main__":
    movies = []
    # The board shows 10 movies per page; "offset" pages through the top 100.
    for page in range(10):
        url = 'https://maoyan.com/board/4?offset=' + str(10 * page)
        movies = parse_html(url, movies)
    for movie in movies:
        print(movie)
    print("一共有" + str(len(movies)) + "条数据!")
2、正则表达式提取西南大学讲座信息
import re

import requests


def get_text(url):
    """Fetch *url* and return its decoded HTML text.

    Raises requests.HTTPError on a non-2xx status code.
    """
    r = requests.get(url)
    r.raise_for_status()
    # Use the encoding detected from the body so Chinese text decodes.
    r.encoding = r.apparent_encoding
    return r.text


# Matches "<span class="fr">[date] ... title</a>" lecture entries.
# NOTE(review): unescaped "[(.*?)]" in the pasted notes was a character
# class matching a single "(", ".", "*", "?" or ")"; the brackets are
# restored to literal \[ \] so group 1 captures the bracketed date.
LECTURE_PATTERN = re.compile(r'<li><span class="fr">\[(.*?)\].*? (.*?)</a>', re.S)


def parse_html(url, results):
    """Append every (date, title) tuple scraped from *url* to *results*."""
    results.extend(re.findall(LECTURE_PATTERN, get_text(url)))
    return results


if __name__ == "__main__":
    lectures = []
    lectures = parse_html('http://computer.swu.edu.cn/s/computer/kxyj2xsky/index.html', lectures)
    # Later listing pages are named index_2.html .. index_4.html.
    for page in range(2, 5):
        url = "http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_" + str(page) + ".html"
        lectures = parse_html(url, lectures)
    for item in lectures:
        print(item)
    print("一共有" + str(len(lectures)) + "条数据!")
3、爬取图片(注:下方粘贴的代码与第2节的讲座爬取相同,图片下载函数见文末的注释部分)
# NOTE(review): this section is headed "爬取图片" (image crawling) but the
# pasted code is byte-identical to the section-2 lecture scraper; the
# actual image-download helpers appear (commented out) at the end of the
# notes.  The duplicate is kept, with the same regex-escape fixes applied.
import re

import requests


def get_text(url):
    """Fetch *url* and return its decoded HTML text.

    Raises requests.HTTPError on a non-2xx status code.
    """
    r = requests.get(url)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


# Literal \[ \] restored around the captured date (the pasted notes had
# an unescaped "[...]", i.e. an unintended character class).
LECTURE_PATTERN = re.compile(r'<li><span class="fr">\[(.*?)\].*? (.*?)</a>', re.S)


def parse_html(url, results):
    """Append every (date, title) tuple scraped from *url* to *results*."""
    results.extend(re.findall(LECTURE_PATTERN, get_text(url)))
    return results


if __name__ == "__main__":
    lectures = []
    lectures = parse_html('http://computer.swu.edu.cn/s/computer/kxyj2xsky/index.html', lectures)
    for page in range(2, 5):
        url = "http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_" + str(page) + ".html"
        lectures = parse_html(url, lectures)
    for item in lectures:
        print(item)
    print("一共有" + str(len(lectures)) + "条数据!")
import os
import re
from hashlib import md5

import requests


def get_text(url):
    """Fetch *url* and return its decoded HTML text.

    Raises requests.HTTPError on a non-2xx status code.
    """
    r = requests.get(url)
    r.raise_for_status()
    # Use the body-sniffed encoding so Chinese text decodes correctly.
    r.encoding = r.apparent_encoding
    return r.text


# Matches '<li><span class="fr">[YYYY-MM-DD] ... title</a></li>' entries:
#   group 1: the date \d{4}-\d{2}-\d{2} inside literal brackets
#   group 2: the lecture title
# NOTE(review): the pasted notes lost the regex backslashes
# ("<spansclass", "[(d{4}-d{2}-d{2})]") and broke the string across lines;
# restored as \s, \[ \], \d.  The separator before the title is assumed to
# be a literal space as in the other scripts — confirm against the page.
LECTURE_PATTERN = re.compile(
    r'<li><span\sclass="fr">\[(\d{4}-\d{2}-\d{2})\].*? (.*?)</a></li>',
    re.S,
)


def parse_html(url, results):
    """Append every (date, title) tuple scraped from *url* to *results*."""
    results.extend(re.findall(LECTURE_PATTERN, get_text(url)))
    return results


def download_image(url):
    """Download the image at *url* and save it under a content-hash name."""
    r = requests.get(url)
    r.raise_for_status()
    save_image(r.content)


def save_image(content):
    """Write *content* to disk, named by its MD5 digest to skip duplicates."""
    file_path = '{0}/{1}.{2}'.format('C:/Users/Think/Desktop/image',
                                     md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        # Binary mode: the payload is raw image bytes, not text.
        with open(file_path, 'wb') as f:
            f.write(content)


if __name__ == "__main__":
    lectures = []
    lectures = parse_html('http://computer.swu.edu.cn/s/computer/kxyj2xsky/index.html', lectures)
    for page in range(2, 5):
        # http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_2.html etc.
        url = 'http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_' + str(page) + '.html'
        lectures = parse_html(url, lectures)
    for item in lectures:
        print(item)
    print("一共有" + str(len(lectures)) + "条数据!")
    # The scraped tuples are (date, title), not image URLs, so the download
    # loop from the original notes stays disabled:
    # for item in lectures:
    #     download_image(item)
    #     print("下载成功")