zoukankan      html  css  js  c++  java
  • Tkinter(一)

    采集小工具,目前采集主要针对知乎文章与评论,今天刚开始弄,会不断更新完善

    目前效果(测试站点:科技;测试链接:http://zhihu.sogou.com/include/pc/pc/topic/topic2_0.html)

    1.在输入框中输入站点与链接

    2.点击“提交链接”进行采集(会判断链接是否有效,文本框显示输入的站点与链接)

    3.每次采集成功后,输入的站点与链接会追加保存到同目录下的 1.txt 文件中

    4.点击“采集所有”进行采集(将 txt 中保存的所有链接重新采集)

    数据库显示

    tkinter 代码

    import tkinter
    import tkinter.filedialog
    import tkinter.messagebox
    from tkinter.scrolledtext import ScrolledText
    from threading import Thread
    from yanshi.hot import *
    
    class Cmpfile:
        """Tkinter front end for the Zhihu collector.

        The window offers two entry fields (site name and link), a scrolled
        text area used as a log, and buttons to submit one link, reload the
        saved records, re-collect every saved record and clear the log.
        Successfully collected pairs are appended to ``RECORD_FILE`` as
        ``sitename:link`` lines.
        """

        # Records file, relative to the current working directory.
        RECORD_FILE = './1.txt'

        def __init__(self):
            """Build the window and run the Tk main loop.

            The call only returns once ``mainloop()`` ends.
            """
            # Kept for backward compatibility; the methods below use locals so
            # that concurrent worker threads never share mutable state.
            self.list1 = []
            self.list2 = []
            self.item = {}
            self.root = tkinter.Tk()
            self.root.title('知乎')
            self.root.minsize(400, 350)
            self.Menu1()
            self.Label1()
            self.root.mainloop()

        def thread_up(self, func):
            """Run ``func`` (a zero-argument callable) on a daemon thread."""
            # daemon=True replaces the deprecated Thread.setDaemon(True).
            worker = Thread(target=func, daemon=True)
            worker.start()

        def up(self):
            """Collect the single site/link pair typed into the entry fields.

            The pair is echoed to the log, handed to ``zhihu(...).starts()``
            and, when that returns '采集结束', appended to ``RECORD_FILE``.
            Both entry fields are cleared afterwards.
            """
            sitename = self.entry1.get()
            link = self.entry2.get()

            self.text.insert('insert', sitename + '-' + link + '\n')
            tkinter.messagebox.showinfo('温馨提示', '开始采集')
            item = {sitename: link}
            print(item)
            result = zhihu([item]).starts()
            tkinter.messagebox.showinfo('温馨提示', result)
            if result == '采集结束':
                # The context manager guarantees the file is flushed and closed
                # (the previous code referenced f.close without calling it).
                with open(self.RECORD_FILE, 'a') as f:
                    f.write(sitename + ':' + link + '\n')
            self.entry1.delete(0, 'end')
            self.entry2.delete(0, 'end')

        def load(self):
            """Replace the log contents with the lines stored in ``RECORD_FILE``."""
            self.clear()
            for line in self._read_records():
                # Each stored line already ends with a newline; the extra one
                # keeps the original blank-line spacing between records.
                self.text.insert('insert', line + '\n')

        def clear(self):
            """Remove all text from the log area."""
            self.text.delete('1.0', 'end')

        def all(self):
            """Re-collect every record stored in ``RECORD_FILE``.

            Each stored line is echoed to the log. Lines that parse into a
            site/link pair are gathered and collected together on a background
            thread so the window stays responsive.
            """
            targets = []
            for line in self._read_records():
                self.text.insert('insert', line)
                record = self._parse_record(line)
                if record is not None:
                    targets.append(record)
            if targets:
                self.thread_up(lambda: self._collect_all(targets))

        def _collect_all(self, targets):
            """Run the collector over ``targets`` and report its status string."""
            result = zhihu(targets).starts()
            tkinter.messagebox.showinfo('温馨提示', result)

        def _read_records(self):
            """Return the raw lines of ``RECORD_FILE``, or [] if it does not exist yet."""
            try:
                with open(self.RECORD_FILE, 'r') as f:
                    return f.readlines()
            except FileNotFoundError:
                return []

        @staticmethod
        def _parse_record(line):
            """Parse one stored line into ``{sitename: link}``.

            ``up`` writes records as ``sitename:link``; the older
            ``sitename-link`` form is accepted as well. The link must start
            with ``http://`` or ``https://`` so that the colon inside the URL
            scheme is never mistaken for the separator. Returns ``None`` for
            blank or unparseable lines.
            """
            line = line.strip()
            for separator in (':', '-'):
                name, found, link = line.partition(separator)
                if found and name and link.startswith(('http://', 'https://')):
                    return {name: link}
            return None

        # Basic layout using place()
        def Menu1(self):
            """Attach the menu bar to the root window.

            All four top-level entries currently share the same (empty)
            drop-down menu.
            """
            menu = tkinter.Menu(self.root)
            lookmenu = tkinter.Menu(menu, tearoff=0, bg='purple', fg='white')
            for label in ('日志', '帮助', '登录', '查看'):
                menu.add_cascade(menu=lookmenu, label=label)
            self.root.config(menu=menu)

        def Label1(self):
            """Create and place the labels, entry fields, buttons and log area."""
            tkinter.Label(self.root, text='sitename', height=1, width=1, pady=3, bd=3).place(
                relx=0.02, rely=0.05, relwidth=0.2)
            tkinter.Label(self.root, text='link', height=1, width=1, pady=3, bd=3).place(
                relx=0.02, rely=0.15, relwidth=0.2)
            self.entry1 = tkinter.Entry(self.root, width=40, bg='white', bd=5)
            self.entry1.place(relx=0.25, rely=0.05, relwidth=0.7)
            self.entry2 = tkinter.Entry(self.root, width=40, bg='white', bd=5)
            self.entry2.place(relx=0.25, rely=0.15, relwidth=0.7)

            # Submitting runs on a worker thread because collecting blocks for
            # a long time.
            tkinter.Button(self.root, text='提交链接', height=1, width=8, pady=5, bd=1,
                           command=lambda: self.thread_up(self.up)).place(x=105, y=100)
            tkinter.Button(self.root, text='载入文本', height=1, width=8, pady=5, bd=1,
                           command=self.load).place(x=190, y=100)
            tkinter.Button(self.root, text='采集所有', height=1, width=8, pady=5, bd=1,
                           command=self.all).place(x=275, y=100)
            self.text = ScrolledText(self.root, height=8, width=37, bg='white', pady=3, bd=3)
            self.text.place(x=100, y=150)
            tkinter.Button(self.root, text='清空', height=1, width=8, pady=5, bd=1,
                           command=self.clear).place(x=105, y=280)
    
    # Instantiate the application (the constructor builds the window and runs the Tk main loop)
    one = Cmpfile()
    

    采集代码放在另外的 py 文件中(即上面 from yanshi.hot import * 引用的模块),运行时引用

    import re
    import pymysql
    import time
    import datetime
    import requests
    from lxml import etree
    from multiprocessing.dummy import Pool as ThreadPool
    class zhihu(object):
        """Collect Zhihu questions and their answers into the ``wenda`` table.

        ``urls`` is a list of ``{channel_name: listing_url}`` dicts. Each
        listing page is fetched, the question links found on it are visited,
        and one row per question plus one row per answer is inserted through
        pymysql.
        """

        # Seconds to wait for any single HTTP response before giving up.
        REQUEST_TIMEOUT = 30

        # Matches the label \u53d1\u5e03\u4e8e / \u7f16\u8f91\u4e8e
        # ("published on" / "edited on") followed by either a full date or a
        # time of day.
        _TIME_PATTERN = re.compile(
            r'((\u53d1\u5e03\u4e8e|\u7f16\u8f91\u4e8e).*?(\d+-\d+-\d+|\d+:\d+))')
        _DIGITS = re.compile(r'\d+')

        # Parameterized statements: scraped text regularly contains quotes,
        # so values must never be spliced into the SQL string.
        _QUESTION_SQL = (
            "INSERT INTO wenda (IR_GROUPNAME,IR_SITENAME,IR_CHANNEL,IR_URLTITLE,"
            "IR_QUESTION,IR_URLNAME,IR_LASTTIME,IR_VIEW,IR_FOLLOW,IR_RETURN,BBSNUM) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        _ANSWER_SQL = (
            "INSERT INTO wenda (IR_GROUPNAME,IR_SITENAME,IR_CHANNEL,IR_URLTITLE,"
            "IR_URLNAME,IR_URLTIME,IR_LASTTIME,BBSNUM,IR_AUTHOR,IR_RESPONSE,"
            "IR_AGREE,IR_COMMENT) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")

        def __init__(self, urls):
            """Store the work list and the header sets used for the two sites."""
            self.url_list = urls
            # Headers for the Sogou listing pages.
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
                'Connection': 'keep-alive',
                'Host': 'zhihu.sogou.com',
                'Referer': 'http://zhihu.sogou.com/',
            }
            # Headers for the question pages.
            self.headers1 = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
                'Referer': 'https://www.zhihu.com/',
            }

        def ToResponse(self, res):
            """Decode ``res`` with its detected encoding and return the parsed HTML tree."""
            res.encoding = res.apparent_encoding
            return etree.HTML(res.text)

        def request_url(self, url):
            """Fetch a listing page and return the question links found on it.

            Network errors are deliberately not caught here: ``starts`` turns
            them into the '链接错误' status shown to the user.
            """
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            page = self.ToResponse(response)
            return page.xpath('//li/p[@class="tit"]/a/@href')

        def connectdb(self):
            """Open and return a new connection to the ``zhihu`` database."""
            print('连接到mysql服务器...')
            # Keyword arguments: positional connect() arguments are deprecated
            # in PyMySQL.
            # NOTE(review): host and credentials are hard-coded placeholders.
            db = pymysql.connect(host="服务器", user="root", password="123456",
                                 database="zhihu", charset='utf8')
            print('连接上了!')
            return db

        def get_link(self, item):
            """Collect every question linked from each listing page in ``item``.

            ``item`` maps a channel name to a listing URL. One database
            connection is opened per listing page and closed when that page is
            done, even if collecting fails part-way.
            """
            for groupname, url in item.items():
                link_list = self.request_url(url)
                db = self.connectdb()
                try:
                    cursor = db.cursor()
                    for link in link_list:
                        print(link)
                        self._collect_question(db, cursor, groupname, link)
                finally:
                    db.close()

        def _collect_question(self, db, cursor, channel, link):
            """Fetch one question page and store the question and its answers."""
            try:
                response = requests.get(url=link, headers=self.headers1,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as e:
                # One unreachable question must not abort the whole listing.
                print(e)
                return
            page = self.ToResponse(response)

            IR_GROUPNAME = '问答社区'
            IR_SITENAME = '搜狗知乎'
            IR_LASTTIME = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            try:
                IR_URLTITLE = page.xpath('//h1[@class="QuestionHeader-title"]/text()')[0]
                IR_QUESTION = ''.join(page.xpath("//span[@class='RichText']//text()"))
                IR_RETURN = page.xpath("//div[@class='List-header']//h4/span/text()")[0]
                IR_FOLLOW = page.xpath(
                    "//div[@class='QuestionFollowStatus']//div[@class='NumberBoard-item'][1]//strong/text()")[0]
                IR_VIEW = page.xpath(
                    "//div[@class='QuestionFollowStatus']//div[@class='NumberBoard-item'][2]//strong/text()")[0]
                print(IR_GROUPNAME, IR_URLTITLE, IR_QUESTION, IR_RETURN, IR_FOLLOW, IR_VIEW)

                # The question itself is stored with BBSNUM '0'; answers are
                # numbered from 1.
                self._insert(db, cursor, self._QUESTION_SQL, (
                    IR_GROUPNAME, IR_SITENAME, channel, IR_URLTITLE, IR_QUESTION,
                    link, IR_LASTTIME, IR_VIEW, IR_FOLLOW, IR_RETURN, '0'))

                # Answers
                huifu = page.xpath("//div[@class='List']//div[@class='List-item']")
                print(huifu)
                for num, one in enumerate(huifu, start=1):
                    author = one.xpath(
                        ".//div[@class='AuthorInfo-head']//a[@class='UserLink-link']/text()")
                    IR_AUTHOR = author[0] if author else '匿名用户'
                    IR_RESPONSE = ''.join(one.xpath(".//div[@class='RichContent-inner']//text()"))
                    IR_URLTIME = self._parse_answer_time(
                        one.xpath(".//div[@class='ContentItem-time']//span/text()")[0])
                    IR_AGREE = self._normalize_agree(
                        one.xpath(".//button[@aria-label='赞同']/text()")[0])
                    print(IR_AGREE)
                    IR_COMMENT = self._extract_comment_count(one.xpath(
                        ".//div[@class='ContentItem-actions RichContent-actions']/button[1]/text()")[0])
                    self._insert(db, cursor, self._ANSWER_SQL, (
                        IR_GROUPNAME, IR_SITENAME, channel, IR_URLTITLE, link,
                        IR_URLTIME, IR_LASTTIME, str(num), IR_AUTHOR, IR_RESPONSE,
                        IR_AGREE, IR_COMMENT))
                    time.sleep(1)
            except Exception as e:
                # A page whose layout does not match the XPaths is reported and
                # skipped; the remaining links are still processed.
                print(e)

        @staticmethod
        def _insert(db, cursor, sql, params):
            """Execute one parameterized INSERT and commit; roll back and report on failure."""
            try:
                cursor.execute(sql, params)
                db.commit()
            except Exception as e:
                print('插入数据失败!')
                print(e)
                db.rollback()

        @classmethod
        def _parse_answer_time(cls, text):
            """Turn the answer's time label into a date or date-time string.

            A full date in the label is returned unchanged. A bare time of day
            is prefixed with yesterday's date when the label contains '昨天',
            otherwise with today's date. Raises ``ValueError`` when the label
            has no recognisable time.
            """
            match = cls._TIME_PATTERN.search(text)
            if match is None:
                raise ValueError('unrecognised answer time: %r' % (text,))
            label, stamp = match.group(1), match.group(3)
            if '-' in stamp:
                return stamp
            if '昨天' in label:
                yesterday = datetime.date.today() - datetime.timedelta(days=1)
                return str(yesterday) + ' ' + stamp
            # NOTE(review): "today" uses '/' separators while the other
            # branches use '-'; kept as-is so stored values keep their format.
            return time.strftime("%Y/%m/%d") + ' ' + stamp

        @staticmethod
        def _normalize_agree(text):
            """Expand a 'K'-suffixed count such as '2.5K' to '2500'; other text is returned unchanged."""
            if 'K' in text:
                return str(int(float(text.replace('K', '')) * 1000))
            return text

        @classmethod
        def _extract_comment_count(cls, text):
            """Return the first run of digits in ``text``, or '0' when there is none."""
            match = cls._DIGITS.search(text)
            return match.group() if match else '0'

        def starts(self):
            """Process every entry of ``url_list`` on a pool of five threads.

            Returns '采集结束' when all entries were processed and '链接错误'
            when any worker raised (for example an unreachable listing URL).
            """
            pool = ThreadPool(5)
            try:
                started = time.time()
                pool.map(self.get_link, self.url_list)
                print('多线程耗时 : ' + str(time.time() - started) + ' s')
                return '采集结束'
            except Exception as e:
                print(e)
                return '链接错误'
            finally:
                # Always release the worker threads, including on failure.
                pool.close()
                pool.join()
    

      

  • 相关阅读:
    Windows JScript 在 游览器 中运行 调试 Shell 文件系统
    autohotkey 符号链接 软连接 symbolink
    软链接 硬链接 测试
    SolidWorks 修改 基准面 标准坐标系
    手机 路径 WebDAV 映射 驱动器
    Win10上手机路径
    explorer 命令行
    单位公司 网络 封锁 屏蔽 深信 AC
    cobbler自动化部署原理篇
    Docker四种网络模式
  • 原文地址:https://www.cnblogs.com/snow-l/p/8718677.html
Copyright © 2011-2022 走看看