#coding=utf8
"""Scrape joke texts from a qiushibaike.com page.

Every joke found on the page is appended to ``111.txt`` (after changing the
working directory to ``D:mzitu``) and also inserted into the ``qiubai``
table of a local MySQL database.

NOTE: the script relies on Python 2 string semantics: UTF-8 encoded byte
strings are written to a text-mode file and concatenated with a plain
string literal.
"""
import codecs
import os

import MySQLdb
import requests
from bs4 import BeautifulSoup


def mysql(title):
    """Insert one joke into the ``qiubai`` table of the local ``test`` database.

    A new connection is opened, the insert is committed and the connection
    is closed again on every call.

    :param title: joke text stored in the ``text`` column.
    """
    conn = MySQLdb.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='',
        db='test',
        charset="utf8",
    )
    try:
        cur = conn.cursor()
        try:
            # The parameters must be a sequence: ``(title)`` is only a
            # parenthesised string, ``(title,)`` is the one-element tuple.
            cur.execute("insert into qiubai(text) values(%s)", (title,))
        finally:
            cur.close()
        conn.commit()
    finally:
        # Release the connection even when the insert or commit fails.
        conn.close()


def qiushi(url):
    """Download ``url``, extract every joke and store it on disk and in MySQL.

    The jokes are the text of each ``div.content`` inside the first
    ``div.col1`` of the page.

    :param url: address of the page to scrape.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    # Without a timeout a stalled server would block the script forever.
    content = requests.get(url, headers=headers, timeout=30)
    column = BeautifulSoup(content.text, 'lxml').find('div', class_='col1')
    title_all = [node.get_text() for node in column.find_all('div', class_='content')]

    os.chdir("D:mzitu")
    # ``with`` closes the file even if a database insert raises mid-loop.
    with open("111.txt", 'a') as f:
        for title in title_all:
            # get_text() yields unicode; encode once and reuse the bytes.
            encoded = title.encode('utf-8')
            f.write(encoded)
            f.write(" ")
            # Saves a copy to the database; comment this line out if no
            # database has been configured.
            mysql(encoded)
            print("已完成下载段子" + encoded)


url = 'http://www.qiushibaike.com/'
# The result (None) is deliberately not assigned to ``qiushi`` so the name
# stays bound to the function.
qiushi(url)
写个这么简单的东西踩的坑有:
os.chdir("D:mzitu")
f = open("111.txt", 'a')
一开始下面写的是 D:mzitu.111.txt,拼命地保存,就是看不到数据,我这暴脾气,简直不能忍。
还有,直接打印文本内容会出现乱码,然后用 type() 看了一下,发现是 unicode,要用 encode('utf-8') 转换一下。
后面保存到mysql中又出现乱码,还是不一样的乱码,我真是踏马的万马奔腾一马平川的草泥马,后来发现是我建表没设置utf-8编码。
然后终于完成了这个小小的东西。