Building a Personal Website Step by Step (11): Crawling cnblogs Data with Python

    At first I planned to use the IronPython library to run Python methods directly from C#, but importing packages kept throwing errors. So instead I will build a small web service in Python later and have the C# site call its API directly.
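    Since the crawler will eventually be exposed as a web service that the C# site calls over HTTP, here is a minimal sketch of what that service could look like. Flask, the route name, and the port are my assumptions for illustration; they are not part of the original project.

    # Minimal sketch of exposing the crawler as an HTTP API (Flask, route, and port are assumptions)
    from flask import Flask, jsonify

    app = Flask(__name__)

    @app.route('/articles/<blog_app>')
    def articles(blog_app):
        # In the real service this would run the crawler below and return
        # the collected articles as JSON for the C# site to consume.
        return jsonify({'blogApp': blog_app, 'status': 'ok'})

    if __name__ == '__main__':
        app.run(port=5000)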

    Now let's start crawling the cnblogs data.

    Crawling cnblogs is straightforward, since the pages are all static data.

    The approach:

    1. Crawl the post categories from the sidebar and get their URLs.
    2. Crawl each category page and get the URLs of the articles it lists.
    3. Crawl each article's details and download its images.
    4. Replace the image links in the article body with the local paths.

    Here's the code:

    import os

    import requests
    from pyquery import PyQuery as pq


    def Request(url, data=""):
        # Fetch a page with a browser-like User-Agent; return the HTML text, or 0 on failure
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
        req = requests.get(url, headers=headers, params=data)
        if req.status_code == 200:
            return req.text
        return 0


    def disfeilei(html):
        # Parse the sidebar and return every post category as {url, name, count}
        doc = pq(html)
        listfls = doc('#sidebar_postcategory').find('li a').items()
        categories = []
        for fl in listfls:
            classfly = {
                'url': fl.attr('href'),
                'name': fl.text().split('(')[0],
                'count': fl.text().split('(')[1][0:-1]  # strip the trailing ')'
            }
            categories.append(classfly)
        return categories


    def Getwzcon(url):
        # Fetch one article, download its images, and rewrite each img src to the local path
        html = Request(url)
        doc = pq(html)
        con = doc('#main').find('#cnblogs_post_body')
        imglist = con.find("img").items()
        for i in imglist:
            src = i.attr('src')
            # keep the part of the original URL after the blog's image id '797834/'
            index = src.find('797834') + 7
            flit = src[index:]
            # save into the web project's image folder
            path = 'h:/。net学习/blogs/BLOGS/WebApplication1/images/blogs/' + flit
            # dowimg(src, path)      # download the image
            i.attr('src', path)      # replace the image link
        return con.html()


    def dowimg(url, path):
        # Download one image to the given path, creating the directory if needed
        paths = os.path.dirname(path)
        if not os.path.exists(paths):
            os.makedirs(paths)
        response = requests.get(url).content
        with open(path, 'wb') as f:
            f.write(response)
        print("file downloaded")


    def Getwenz(classfly):
        # Crawl one category page and collect every article listed on it
        html = Request(classfly)
        doc = pq(html)
        listwzs = doc('#main').find('.entrylist>.entrylistItem').items()
        articles = []
        for i in listwzs:
            title = i.find('.entrylistItemTitle').text()
            url = i.find('.entrylistItemTitle').attr('href')
            desc = i.find('.c_b_p_desc').text()[0:-4]  # drop the trailing "阅读全文" link text
            entry = i.find('.entrylistItemPostDesc').text().split(" ")
            datatime = entry[2] + " " + entry[3]
            readcount = entry[5][3:-1]
            # fetch the full article body
            content = Getwzcon(url)
            art = {
                'title': title,
                'url': url,
                'desc': desc,
                'datatime': datatime,
                'readcount': readcount,
                'body': content
            }
            print(art)
            articles.append(art)
        return articles


    if __name__ == '__main__':
        url = "https://www.cnblogs.com/ruogu/mvc/blog/sidecolumn.aspx"
        data1 = {'blogApp': 'ruogu'}
        textfeilei = Request(url, data1)
        if textfeilei != 0:
            # get all categories
            list_fly = disfeilei(textfeilei)
            # walk through each category
            for item in list_fly:
                # save to the database here
                # get the article details
                Getwenz(item['url'])
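    The main loop only marks where each article would be written to the database. As a follow-up, here is a minimal sketch of that step using sqlite3; the database file, table name, and column names are my assumptions for illustration, not part of the original project.

    # Minimal sketch of persisting crawled articles with sqlite3 (file/table/column names are assumptions)
    import sqlite3

    def save_articles(articles, db_path='blogs.db'):
        # Store each article dict returned by Getwenz in a local SQLite database.
        conn = sqlite3.connect(db_path)
        conn.execute("""CREATE TABLE IF NOT EXISTS article
                        (title TEXT, url TEXT, "desc" TEXT, datatime TEXT,
                         readcount TEXT, body TEXT)""")
        conn.executemany(
            "INSERT INTO article VALUES (?, ?, ?, ?, ?, ?)",
            [(a['title'], a['url'], a['desc'], a['datatime'],
              a['readcount'], a['body']) for a in articles])
        conn.commit()
        conn.close()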

      

Original article: https://www.cnblogs.com/ruogu/p/11175438.html