需求:爬取豆瓣上top250电影信息
结构:title,type,summry,url,rate
from bs4 import BeautifulSoup
import requests

# Scrape the first page of Douban's Top 250 list, then visit every movie's
# detail page to collect its genres and summary, and finally dump all
# records to webData.txt.

# Common CSS prefix that selects the info box of each movie on the list page.
INFO_BOX = '#content > div > div.article > ol > li > div > div.info'

list_page = BeautifulSoup(
    requests.get('https://movie.douban.com/top250').text, 'lxml')
link_tags = list_page.select(INFO_BOX + ' > div.hd > a')
title_tags = list_page.select(INFO_BOX + ' > div.hd > a > span:nth-of-type(1)')
rate_tags = list_page.select(INFO_BOX + ' > div.bd > div > span.rating_num')

# One record per movie; 'type' and 'summry' are filled in from the detail
# page below.  (The name `list` is kept because the later snippets use it.)
list = [
    {
        'title': title_tag.get_text(),
        'url': link_tag.get('href'),
        'rate': rate_tag.get_text(),
        'type': None,
        'summry': None,
    }
    for title_tag, link_tag, rate_tag in zip(title_tags, link_tags, rate_tags)
]

for item in list:
    detail_page = BeautifulSoup(requests.get(item['url']).text, 'lxml')

    # The genre spans sit between the '类型:' label and the
    # '制片国家/地区:' label inside the #info box.
    genres = []
    in_genres = False
    for span in detail_page.select('#info > span'):
        label = span.get_text()
        if label == '类型:':
            in_genres = True
        elif label == '制片国家/地区:':
            break
        elif in_genres:
            genres.append(label)
    item['type'] = str(genres)

    summary_tags = detail_page.select('#link-report > span[property="v:summary"]')
    if summary_tags:
        item['summry'] = str(summary_tags[0].get_text())

with open('webData.txt', 'w') as outFile:
    for each_item in list:
        print("title : " + each_item['title'], file=outFile)
        print("type : " + each_item['type'], file=outFile)
        if each_item['summry'] is None:
            print("summry : ", file=outFile)
        else:
            # NOTE: str() of a bytes object yields its repr (b'...'), so this
            # line writes the GBK byte representation, not readable text.
            print("summry : " + str(each_item['summry'].encode('GBK', 'ignore')),
                  file=outFile)
        print("url : " + each_item['url'], file=outFile)
        print("rate : " + each_item['rate'], file=outFile)
上面代码中输出 summry 的 else 分支里 str(each_item['summry'].encode('GBK', 'ignore')) 一处(原文以红色标注)是修改过的,原来的代码为:
print("summry : " + each_item['summry'],file = outFile)
报错内容:
Traceback (most recent call last):
  File "testDemo.py", line 58, in <module>
    print("summry : " + each_item['summry'],file = outFile)
UnicodeEncodeError: 'gbk' codec can't encode character '\ufc3a' in position 43: illegal multibyte sequence
查过很多资料,其中比较有效的方式,则是忽略gbk编码时不能识别的符号,则为红色标注部分代码,其参考资料来源为:
http://www.crifan.com/unicodeencodeerror_gbk_codec_can_not_encode_character_in_position_illegal_multibyte_sequence/
按此方式修改后,Python 运行时虽然不再报错,但写入文件的内容直接打开仍是乱码(str() 作用于 bytes 对象得到的是 b'...' 形式的字节表示,而不是文本)。于是作如下修改:
# Write every scraped record to webData.txt.
#
# The file is opened as UTF-8 explicitly: without `encoding`, open() uses the
# locale's preferred encoding (GBK here), which is what raised
# UnicodeEncodeError for characters GBK cannot represent.  UTF-8 can encode
# every character, so nothing has to be dropped with errors='ignore' and the
# title/type/url lines are protected as well.
with open('webData.txt', 'w', encoding='utf-8') as outFile:
    for each_item in list:
        print("title : " + each_item['title'], file=outFile)
        print("type : " + each_item['type'], file=outFile)
        # 'summry' stays None when the detail page had no summary element.
        print("summry : " + (each_item['summry'] or ""), file=outFile)
        print("url : " + each_item['url'], file=outFile)
        print("rate : " + each_item['rate'], file=outFile)
搞定!
其他方式解决,参考:
http://jerrypeng.me/2014/02/python-2-unicode-print-pitfall/
代码如下:
import io
import sys

# Re-wrap standard output so that print() encodes as UTF-8.
# TextIOWrapper has to wrap a *binary* stream, hence sys.stdout.buffer rather
# than the text-mode sys.stdout itself.  This only affects standard output;
# files opened with open() keep their own encoding.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
这样做是想把输出编码改为 utf-8,但运行时仍然报错:这行代码只影响标准输出(sys.stdout),而 open('webData.txt','w') 打开的文件仍然使用系统默认编码(此处为 gbk),所以写文件时依然会抛出:
UnicodeEncodeError: 'gbk' codec can't encode character '\u2022' in position 190: illegal multibyte sequence
# Write every scraped record to webData.txt.
#
# The file is opened as UTF-8 explicitly: without `encoding`, open() uses the
# locale's preferred encoding (GBK here), which is what raised
# UnicodeEncodeError for characters GBK cannot represent.  UTF-8 can encode
# every character, so nothing has to be dropped with errors='ignore' and the
# title/type/url lines are protected as well.
with open('webData.txt', 'w', encoding='utf-8') as outFile:
    for each_item in list:
        print("title : " + each_item['title'], file=outFile)
        print("type : " + each_item['type'], file=outFile)
        # 'summry' stays None when the detail page had no summary element.
        print("summry : " + (each_item['summry'] or ""), file=outFile)
        print("url : " + each_item['url'], file=outFile)
        print("rate : " + each_item['rate'], file=outFile)