zoukankan      html  css  js  c++  java
  • bokeyuan_python文章爬取入mongodb读取--LOWBIPROGRAMMER

    # -*- coding: utf-8 -*-
    import requests,os
    from lxml import etree
    from pymongo import *

    class Boke(object):
        """Crawl the cnblogs Python category page and store results in MongoDB.

        The listing page is fetched and parsed for post links. Each link is
        stored in the ``boke.zhu`` collection, and the linked article's title
        and body text are stored in the ``boke.info`` collection.
        """

        def __init__(self):
            self.url = "https://www.cnblogs.com/cate/python/"
            self.headers = {
                'user-agent': (
                    'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                    'AppleWebKit/537.36 (KHTML, like Gecko) '
                    'Chrome/56.0.2924.90 Safari/537.36 '
                    '2345Explorer/9.3.2.17331'
                )
            }
            # Created on first use by dbs() so every write shares one client
            # instead of opening a new connection pool per inserted document.
            self._client = None

        @staticmethod
        def _first(nodes, default=None):
            """Return the first item of *nodes*, or *default* when it is empty."""
            return nodes[0] if nodes else default

        def get_data(self, url, timeout=10):
            """Fetch *url* with the crawler's headers and return the raw body.

            Args:
                url: Address to request.
                timeout: Seconds to wait for the server before giving up.
                    Without a timeout ``requests`` may block indefinitely.

            Returns:
                The response body as bytes.
            """
            response = requests.get(url, headers=self.headers, timeout=timeout)
            return response.content

        def xml_data(self, data):
            """Parse the listing page and process every post entry found.

            For each ``div.post_item`` the article link is extracted, the
            article itself is crawled via ``info_data``, and the link is
            stored via ``write_dbs``. Entries without a link are skipped
            rather than aborting the whole crawl with ``IndexError``.

            Args:
                data: HTML of the listing page.
            """
            html = etree.HTML(data)
            if html is None:
                # NOTE(review): lxml may yield None for an empty document;
                # confirm against the lxml version in use.
                return
            for item in html.xpath("//div[@class='post_item']"):
                info_url = self._first(
                    item.xpath("./div[@class='post_item_body']/h3/a/@href")
                )
                if info_url is None:
                    continue
                self.info_data(info_url)
                self.write_dbs({'url': info_url})

        def info_data(self, data):
            """Fetch one article page and store its title and body text.

            Posts whose title cannot be located are skipped, so every stored
            document carries a title.

            Args:
                data: URL of the article page.
            """
            # The directory is created here, but nothing in this class
            # writes to it; kept so existing side effects are unchanged.
            path = "f:/woc/"
            if not os.path.exists(path):
                os.makedirs(path)

            html = etree.HTML(self.get_data(data))
            if html is None:
                return
            for post in html.xpath("//div[@id='topics']/div[@class='post']"):
                title = self._first(
                    post.xpath("./h1[@class='postTitle']/a/text()")
                )
                if title is None:
                    continue
                info = post.xpath("./div[@class='postBody']//text()")
                self.write1_dbs({'title': title, 'info': info})

        def dbs(self):
            """Return the ``(zhu, info)`` collections of the ``boke`` database.

            The MongoDB client for 127.0.0.1:27017 is created on the first
            call and reused afterwards.
            """
            # getattr keeps this working on instances built without __init__.
            client = getattr(self, '_client', None)
            if client is None:
                client = MongoClient('127.0.0.1', 27017)
                self._client = client
            database = client['boke']
            return database['zhu'], database['info']

        def write_dbs(self, data):
            """Insert *data* into ``zhu``, then print every document in it."""
            links, _ = self.dbs()
            links.insert_one(data)
            # Debug output: dumps the whole collection after each insert.
            for doc in links.find():
                print(doc)

        def write1_dbs(self, data):
            """Insert *data* into ``info``, then print every document in it."""
            _, articles = self.dbs()
            articles.insert_one(data)
            # Debug output: dumps the whole collection after each insert.
            for doc in articles.find():
                print(doc)

        def run(self):
            """Fetch the listing page at ``self.url`` and process it."""
            self.xml_data(self.get_data(self.url))
    if __name__ == '__main__':
        # Start the crawl only when this file is executed as a script.
        Boke().run()
  • 相关阅读:
    HDU 3586 二分答案+树形DP判定
    POJ 3140 树形DP
    POJ 1741 树的点分治
    POJ 1655 求树的重心
    CF 219D 树形DP
    HDU 2196树形DP(2个方向)
    HDU 1520 树形DP入门
    POJ 1159 Palindrome(最长公共子序列)
    树状数组 区间更新 区间查询
    HDU 1556 BIT区间修改+单点查询(fread读入优化)
  • 原文地址:https://www.cnblogs.com/xcsg/p/10138727.html
Copyright © 2011-2022 走看看