After yesterday's study of scraping data with Python, I put it into practice by crawling my own blog.
Fetching the page is done with requests, extracting the information with Beautiful Soup, and a plain txt file is enough for storage.
# coding: utf-8
import requests
from bs4 import BeautifulSoup


def get_blog_info(url):
    # Fetch the index page, then pull every post title and link
    # out of the div#mainContent / div.postTitle structure
    html = get_page(url)
    soup = BeautifulSoup(html, 'lxml')
    article_list = soup.find('div', attrs={'id': 'mainContent'})
    article_item = article_list.find_all('div', attrs={'class': 'postTitle'})
    for ai in article_item:
        title = ai.a.text
        link = ai.a['href']
        print(title)
        print(link)
        write_to_file(title + ' ')
        write_to_file(link + '\n')
def get_page(url):
    # Download the page; return an empty string on any request failure
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Ubuntu Chromium/44.0.2403.89 '
                                 'Chrome/44.0.2403.89 '
                                 'Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        return response.text
    except requests.RequestException:
        return ""
def write_to_file(content):
    # Append to article.txt, creating the file on first use
    with open('article.txt', 'a', encoding='utf-8') as f:
        f.write(content)
if __name__ == '__main__':
    blog_url = "https://www.cnblogs.com/sgh1023/"
    get_blog_info(blog_url)
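Running this prints each post's title and link and appends them, one post per line, to article.txt in the working directory.
The script only reads the first index page. For a blog with more posts, older entries on cnblogs are usually reachable through a paged URL; the sketch below assumes the default.html?page=N pattern (that pattern, and the max_pages safety cap, are my assumptions, not part of the original script). It reuses the same postTitle selector and stops as soon as a page returns no posts.
# coding: utf-8
# Minimal pagination sketch, assuming cnblogs pages the index at
# default.html?page=N; max_pages is a made-up safety cap.
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.cnblogs.com/sgh1023/"
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def crawl_all_pages(max_pages=10):
    with open('article.txt', 'a', encoding='utf-8') as f:
        for page in range(1, max_pages + 1):
            url = f"{BASE_URL}default.html?page={page}"
            response = requests.get(url, headers=HEADERS, timeout=10)
            soup = BeautifulSoup(response.text, 'lxml')
            items = soup.find_all('div', attrs={'class': 'postTitle'})
            if not items:
                # No postTitle divs: we have run past the last index page
                break
            for ai in items:
                f.write(ai.a.text + ' ' + ai.a['href'] + '\n')

if __name__ == '__main__':
    crawl_all_pages()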