zoukankan      html  css  js  c++  java
  • 第一次用python 写的简单爬虫 记录在自己的博客

    #python.py
    from bs4 import BeautifulSoup
    import urllib.request
    from MySqlite import MySqlite
    
    # Running row id for inserted posts; GetBlogTileAndName increments it once
    # per stored link. (A `global` statement at module level has no effect, so
    # a plain assignment is all that is needed here.)
    g_intid = 0
    def GetBlogTileAndName(url):
        """Download the page at ``url`` and store every post link found on it.

        Every element whose class is ``postTitle`` is inspected; the text and
        ``href`` of its ``<a>`` child are printed and handed to
        ``MySqlite.InsertDate`` together with the next value of the
        module-level counter ``g_intid``.

        Args:
            url: Address of the listing page to fetch.

        Raises:
            urllib.error.URLError: if the page cannot be fetched.
            UnicodeDecodeError: if the response body is not valid UTF-8.
        """
        global g_intid

        # The context manager closes the response even when read() fails.
        with urllib.request.urlopen(url) as res:
            html = res.read()

        # Decode explicitly: the decoded text was previously computed and then
        # thrown away, leaving BeautifulSoup to guess the encoding from bytes.
        # Naming the parser keeps results independent of which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(html.decode('utf-8'), "html.parser")
        divs = soup.find_all(attrs={"class": "postTitle"})

        # One helper is enough for the whole page; it holds no per-row state.
        x = MySqlite()
        for divname in divs:
            link = divname.a
            if link is None:
                # No anchor inside this element, so there is nothing to record.
                continue
            title = link.string
            href = link["href"]
            print("title:=", title, "href:=", href)
            g_intid += 1
            x.InsertDate(g_intid, href, title)
    def GetBlogPage(url):
        """Download ``url`` and print the text of its first ``pager`` element.

        Prints ``None`` when the page has no element with class ``pager``
        instead of failing with an AttributeError.

        Args:
            url: Address of the page to fetch.

        Raises:
            urllib.error.URLError: if the page cannot be fetched.
            UnicodeDecodeError: if the response body is not valid UTF-8.
        """
        # The context manager closes the response even when read() fails.
        with urllib.request.urlopen(url) as res:
            html = res.read()

        # Decode explicitly (the decoded text used to be discarded) and name
        # the parser so the result does not depend on installed extras.
        soup = BeautifulSoup(html.decode('utf-8'), "html.parser")
        divPager = soup.find(attrs={"class": "pager"})
        print(divPager.string if divPager is not None else None)


    # Crawl listing pages 1 through 7 of the blog and store the posts found
    # on each of them.
    page_url_prefix = r"http://www.cnblogs.com/FCoding/default.html?page="
    for i in range(1, 8):
        url = page_url_prefix + str(i)
        GetBlogTileAndName(url)

      

    #MySqlite.py
    
    class MySqlite(object):
        """Small SQLite helper that stores crawled blog posts in table ``blog``.

        Every method opens its own connection to ``DB_PATH`` and closes it
        before returning, so instances carry no state of their own.

        The table is not created here. The author's one-off setup statement
        was: ``create table blog (ID integer, url text, title text,
        PRIMARY KEY(ID))``.
        """

        # Database file used by every method. Kept as a class attribute so it
        # can be overridden without editing the methods.
        DB_PATH = r"d:123.db"

        def __init__(self, *args):
            # Arguments are forwarded unchanged to the base initializer.
            super().__init__(*args)

        def callstr(self, str):
            """Print ``str`` to standard output."""
            print(str)

        def _connect(self):
            """Open and return a new connection to ``DB_PATH``."""
            # Function-scope import, so the class works without relying on a
            # module-level ``import sqlite3``.
            import sqlite3
            return sqlite3.connect(self.DB_PATH)

        def InsertDate(self, id, url, title):
            """Insert one ``(id, url, title)`` row into ``blog`` and commit.

            ``id`` is printed before the insert runs. The values are bound as
            SQL parameters rather than formatted into the statement, so quotes
            inside ``url`` or ``title`` can neither break nor alter the query.

            Args:
                id: Value for the first column of the new row.
                url: Value for the second column.
                title: Value for the third column.

            Raises:
                sqlite3.Error: propagated from the driver, for example when
                    the ``blog`` table does not exist.
            """
            conn = self._connect()
            try:
                c = conn.cursor()
                print(id)
                c.execute("insert into blog values (?, ?, ?)", (id, url, title))
                conn.commit()
                c.close()
            finally:
                # Always release the database file, even if the insert failed.
                conn.close()

        def GetDate(self):
            """Print the row count of ``blog``, then every value of every row.

            Each value is printed on its own line, row by row.

            Raises:
                sqlite3.Error: propagated from the driver, for example when
                    the ``blog`` table does not exist.
            """
            conn = self._connect()
            try:
                c = conn.cursor()
                c.execute("select count(*) from blog")
                print(c.fetchone()[0])
                for row in c.execute("select * from blog"):
                    for value in row:
                        print(value)
                c.close()
            finally:
                # Read-only work: nothing to commit, just release the file.
                conn.close()
    

     简述一下功能:

    通过urllib 下载网页 使用BeautifulSoup 解析

    调用find_all(attrs={"class":"postTitle"}) 

    找到 HTML 中所有 class=postTitle 的 tag

    然后遍历 取出title 和href 保存到数据库中

    此程序没有容错处理。新手之作,请勿见笑!

  • 相关阅读:
    如何提高MySQL Limit查询的性能
    asp.net cache 缓存
    a标签的target指向iframe
    Entity Framework实例详解
    MySql循环插入数据(定义了存储过程)
    mysql 定义function rand
    [转] Socket心跳包异常检测的C语言实现,服务器与客户端代码案例
    tinyhttpd ------ C 语言实现最简单的 HTTP 服务器
    http通信过程中,Web浏览器与Web服务器之间将完成下列7个步骤
    http请求数据的格式
  • 原文地址:https://www.cnblogs.com/FCoding/p/3496953.html
Copyright © 2011-2022 走看看