zoukankan html css js c++ java

抓紧时间写项目,也不知道这些算不算

●项目1:写一个能插入图片运行python的ide

#注意在.py文件相同目录下放一个1.png做测试图片
#本质就是用html来实现图片




#写在前面的话:这个程序在python自带ide里面遇到bug就秒退,看不了提示信息
#解决方法如下:

#用cmd 在文件目录里面执行python lastversion.py即可运行
#并且可以进行调试,会返回出错信息,比自带ide好多了.
#总之:别用ide来调试.
#English translation of the Chinese note above:
#run this program from cmd with the command `python lastversion.py`,
#because that makes it more convenient to debug (error messages stay visible).

from PyQt5 import QtWidgets  
from PyQt5.QtWidgets import QFileDialog  
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtCore import *
from PyQt5.QtWidgets import * 
from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt5.QtWidgets import QFileDialog
from PyQt5 import QtWidgets  
from PyQt5.QtWidgets import QFileDialog
import codecs
import sys
import os
def i(a,*arg):
    """Write ``arg[0]`` to the file ``a``, replacing any existing content.

    Args:
        a: Path of the file to create or overwrite.
        *arg: Only the first positional value is used.  It is converted with
            ``str()`` so numbers are accepted as well as strings.

    Raises:
        IndexError: If no value to write is supplied.  The lookup happens
            before the file is opened, so a bad call does not truncate an
            existing file.
    """
    content = str(arg[0])
    # The context manager closes the handle even when write() raises.
    with open(a, 'w') as f:
        f.write(content)
class MyWindow(QtWidgets.QWidget):
    """Tiny editor window with Open / save / run buttons and an embedded image.

    The editor is a ``QTextEdit``; the picture is inserted as an HTML
    ``<img>`` tag, and the document is saved as HTML so the tag is kept.
    """

    def __init__(self):
        """Build the three buttons and the text area and insert sample content."""
        super(MyWindow,self).__init__()
        self.resize(800,800)

        self.myButton = QtWidgets.QPushButton(self)
        self.myButton.setObjectName("myButton")
        self.myButton.setText("Open")
        self.myButton.clicked.connect(self.load)

        self.myButton2 = QtWidgets.QPushButton(self)
        self.myButton2.setObjectName("myButton")
        self.myButton2.setText("save")
        self.myButton2.clicked.connect(self.save)
        self.myButton2.move(200,0)

        self.myButton3 = QtWidgets.QPushButton(self)
        self.myButton3.setObjectName("myButton")
        self.myButton3.setText("run")
        self.myButton3.clicked.connect(self.run)
        self.myButton3.move(400,0)

        self.textEdit = QtWidgets.QTextEdit(self)
        self.textEdit.setGeometry(QtCore.QRect(10, 40, 801, 521))
        self.textEdit.setObjectName("textEdit")
        self.textEdit.resize(1500,600)

        # Sample content; the trailing \n ends each line.
        self.textEdit.insertPlainText('print (42389)\n')
        self.textEdit.insertPlainText('print (423893)\n')
        self.textEdit.insertPlainText('print (423893)\n')
        self.textEdit.insertPlainText('print (423893)\n')
        # The picture is inserted with HTML; 1.png is resolved relative to
        # the working directory.
        self.textEdit.insertHtml('<img src=1.png>' )

    def load(self):
        """Ask for a text file and show its content in the editor."""
        fileName1, filetype = QFileDialog.getOpenFileName(self,
                                    "选取文件",
                                    "D:/",
                                    "Text Files (*.txt)")
        if not fileName1:
            # The dialog was cancelled; open('') would raise.
            return
        with open(fileName1,'r') as f:
            text = f.read()
        self.textEdit.setText(text)

    def save(self):
        """Ask for a target file and save the document as HTML."""
        fileName2, ok2 = QFileDialog.getSaveFileName(self,
                                    "文件保存",
                                    "D:/",
                                    "All Files (*);;Text Files (*.txt)")
        if not fileName2:
            # The dialog was cancelled; nothing to write.
            return
        # HTML (not plain text) so that the <img> tag survives a round trip.
        my_text = str(self.textEdit.toHtml())
        i(fileName2,my_text)

    def run(self):
        """Write the editor text to ``file1.py`` and run it with Python."""
        import subprocess

        # toPlainText() represents embedded images with U+FFFC (object
        # replacement character); drop it so the script stays valid Python.
        source = self.textEdit.toPlainText().replace('\ufffc', '')

        script_path = os.path.join(os.getcwd(), "file1.py")
        with open(script_path, "w") as f:
            f.write(source)

        # An argument list avoids shell quoting problems when the working
        # directory contains spaces; sys.executable is the interpreter that
        # is running this window.
        subprocess.call([sys.executable or 'python', script_path])





        



        
  
if __name__=="__main__":
    # Leftover probe from debugging: checks that writing a file works in the
    # same process as Qt.  It creates file3.py in the working directory.
    with open("file3.py", "w") as probe:
        probe.write('32131231111111111111111111')
    print (4324)

    # Start the Qt event loop and hand its return code to the shell.
    app=QtWidgets.QApplication(sys.argv)
    myshow=MyWindow()
    myshow.show()
    sys.exit(app.exec_())

View Code

功能:把图片1.png放在这个文件的同目录下,就会在文件里面看到这个图片,利用html来实现的.

里面输入的python代码点run就会运行.带保存和打开文件功能.图片插入信息也能保存下来

●爬取百度贴吧任意一个吧的图片

from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import random
import os
import urllib.request
# base_url may be the first page or any later page of any Tieba forum.
base_url = "https://tieba.baidu.com/f?kw=%E4%B8%96%E7%95%8C%E6%9D%AF"
num=0  # running counter; also the file name of the next saved picture

PAGES_TO_CRAWL = 50
THREADS_PER_PAGE = 50      # the pn query parameter advances by 50 per list page
MIN_IMAGE_SIZE = 30000     # pictures whose "size" attribute is not above this are skipped
SAVE_DIR = r'D:\tupian'    # created on demand
TIEBA_ROOT = 'https://tieba.baidu.com'
# Thread links look like /p/<digits>.
THREAD_HREF = re.compile(r'^/p/\d+')


def _list_page_url(start_url, page_index):
    """Return the URL of the list page ``page_index`` pages after ``start_url``.

    When ``start_url`` already carries ``&pn=<offset>`` the crawl continues
    from that offset instead of appending digits to the existing value.
    """
    match = re.search(r'&pn=(\d*)', start_url)
    if match is None:
        first_offset = 0
        prefix = start_url
    else:
        first_offset = int(match.group(1) or 0)
        prefix = start_url[:match.start()] + start_url[match.end():]
    return prefix + '&pn=' + str(first_offset + THREADS_PER_PAGE * page_index)


def _fetch_soup(url):
    """Download ``url`` and parse it; return None (after reporting) on failure."""
    try:
        html = urlopen(url).read().decode('utf-8')
    except Exception as err:
        # Best effort: a page that cannot be fetched or decoded is skipped,
        # but the reason is reported instead of being swallowed.
        print('skipping %s: %s' % (url, err))
        return None
    return BeautifulSoup(html, features='lxml')


def _save_post_images(post_soup, next_number):
    """Save the large pictures of one post; return the updated counter."""
    container = post_soup.cc   # first <cc> tag of the post page
    if container is None:
        return next_number
    for img in container.find_all('img'):
        src = img.get('src')
        size = img.get('size')   # .get returns None when the attribute is missing
        if not src or size is None:
            continue
        try:
            large_enough = int(size) > MIN_IMAGE_SIZE
        except ValueError:
            continue
        if not large_enough:
            continue
        # Take the extension from the URL path; fall back to .jpg if absent.
        extension = os.path.splitext(src.split('?', 1)[0])[1] or '.jpg'
        os.makedirs(SAVE_DIR, exist_ok=True)
        target = os.path.join(SAVE_DIR, str(next_number) + extension)
        try:
            urllib.request.urlretrieve(src, target)
        except Exception as err:
            print('could not download %s: %s' % (src, err))
            continue
        next_number += 1
    return next_number


for page_index in range(PAGES_TO_CRAWL):
    print ('正在爬取第'+str(page_index+1)+'页的所有帖子里面的图片')
    list_soup = _fetch_soup(_list_page_url(base_url, page_index))
    if list_soup is None:
        continue
    thread_links = list_soup.find_all(
        "a", {"target": "_blank", "href": THREAD_HREF})
    for link in thread_links:
        post_soup = _fetch_soup(TIEBA_ROOT + str(link['href']))
        if post_soup is not None:
            num = _save_post_images(post_soup, num)

View Code

功能:从一个吧的任意一页开始,爬取该页及其后面所有页里size大于30000的图片,保存到D:\tupian

细节:利用urlopen模块读取url再decode成utf-8,因为有中文.配合正则表达式和beautifulsoup来匹配图片url.利用urllib.request.urlretrieve来下载图片

●用python建立最简单的服务器

import sys, os, subprocess
from http.server import BaseHTTPRequestHandler,HTTPServer

class ServerException(Exception):
    """Raised when the server cannot satisfy a request (internal server error)."""
#下面是所有事件的处理
class base_case(object):
    """Base class for request cases.

    It holds what all cases share; concrete cases inherit from it and
    override ``test`` (does this case apply?) and ``act`` (handle the
    request).
    """

    def handle_file(self, handler, full_path):
        """Send the content of ``full_path``, or report a read error.

        Args:
            handler: Object providing ``send_content(bytes)`` and
                ``handle_error(msg)``.
            full_path: Path of the file to send.
        """
        try:
            with open(full_path, 'rb') as reader:
                content = reader.read()  # the whole file at once
        except IOError as msg:
            msg = "'{0}' cannot be read: {1}".format(full_path, msg)
            handler.handle_error(msg)
        else:
            # Kept outside the try so a failure while sending is not
            # misreported as a read error.
            handler.send_content(content)

    def index_path(self, handler):
        """Return the path of ``index.html`` inside ``handler.full_path``."""
        return os.path.join(handler.full_path, 'index.html')

    def test(self, handler):
        """Return whether this case applies; subclasses must override."""
        # A real exception rather than `assert False`, which is removed when
        # Python runs with -O.
        raise NotImplementedError('Not implemented.')

    def act(self, handler):
        """Handle the request; subclasses must override."""
        raise NotImplementedError('Not implemented.')


class case_no_file(base_case):
    """Case: the requested file or directory does not exist."""

    def test(self, handler):
        """Return True when nothing exists at ``handler.full_path``."""
        path_exists = os.path.exists(handler.full_path)
        return not path_exists

    def act(self, handler):
        """Report the missing path by raising ``ServerException``."""
        message = "'{0}' not found".format(handler.path)
        raise ServerException(message)



class case_cgi_file(base_case):
    """Case: the requested file is a Python script; run it and send its output.

    NOTE(review): every ``.py`` file that the request path resolves to is
    executed, so only serve a directory whose scripts are trusted.
    """

    def run_cgi(self, handler):
        """Run the script in a child process and send what it printed.

        ``subprocess.check_output`` is used instead of ``os.system`` because
        it returns the script's output rather than only its exit status.
        """
        # sys.executable runs the script with the interpreter serving the
        # request; fall back to "python" if it is not known.
        interpreter = sys.executable or "python"
        data = subprocess.check_output([interpreter, handler.full_path],
                                       shell=False)
        handler.send_content(data)

    def test(self, handler):
        """Return True when ``handler.full_path`` is an existing ``.py`` file."""
        return (os.path.isfile(handler.full_path) and
                handler.full_path.endswith('.py'))

    def act(self, handler):
        """Handle the request by running the script."""
        self.run_cgi(handler)



class case_existing_file(base_case):
    """Case: the requested path is an existing regular file."""

    def test(self, handler):
        """Return True when ``handler.full_path`` is a file."""
        requested = handler.full_path
        return os.path.isfile(requested)

    def act(self, handler):
        """Send the file content to the client."""
        requested = handler.full_path
        self.handle_file(handler, requested)



class case_directory_index_file(base_case):
    """Case: the request names a directory that contains ``index.html``.

    This is what makes the site root open the index page.
    """

    def test(self, handler):
        """Return True when the path is a directory holding an index.html.

        ``handler`` is the ``RequestHandler`` instance serving the request.
        """
        return (os.path.isdir(handler.full_path) and
                os.path.isfile(self.index_path(handler)))

    def act(self, handler):
        """Send the directory's index.html."""
        self.handle_file(handler, self.index_path(handler))



class case_always_fail(base_case):
    """Fallback case: applies to everything and always reports an error."""

    def test(self, handler):
        """Always applies."""
        return True

    def act(self, handler):
        """Raise ``ServerException`` naming the requested path."""
        message = "Unknown object '{0}'".format(handler.path)
        raise ServerException(message)

#先写网络主体:就是写一个返回信息处理函数.必须继承一个HTTPRequestHandler
#这里选择BaseHTTPRequestHandler

class RequestHandler(BaseHTTPRequestHandler):
    """Serve files below the working directory.

    A valid request path is handled by the first matching entry of
    ``Cases``; anything else produces the error page.
    """
    Error_Page = """
            <html>
            <body>
            <h1>Error accessing {path}</h1>
            <p>{msg}</p>
            </body>
            </html>
            """
    # Tried in order; the first case whose test() returns True handles the
    # request.
    Cases = [case_no_file(),
             case_cgi_file(),
             case_existing_file(),
             case_directory_index_file(),
             case_always_fail()]

    def do_GET(self):
        """Handle a GET request (the base class dispatches here by name)."""
        try:
            self.full_path = self._resolve_path(self.path)

            # Try every case in order and let the first match handle it.
            for case in self.Cases:
                if case.test(self):
                    case.act(self)
                    break

        # Any failure is shown to the client as the error page.
        except Exception as msg:
            self.handle_error(msg)

    def _resolve_path(self, request_path):
        """Map the URL path to a file-system path below the working directory.

        The query string is dropped, percent-escapes are decoded and the
        result is joined with ``os.path`` so it works on Windows and POSIX
        without rewriting slashes by hand.

        Raises:
            ServerException: If the resolved path escapes the working
                directory (for example through ``..``).
        """
        from urllib.parse import unquote, urlsplit

        root = os.path.realpath(os.getcwd())
        relative = unquote(urlsplit(request_path).path).lstrip('/\\')
        candidate = os.path.realpath(os.path.join(root, relative))
        try:
            inside_root = os.path.commonpath([root, candidate]) == root
        except ValueError:
            # Raised e.g. for paths on different Windows drives.
            inside_root = False
        if not inside_root:
            raise ServerException("'{0}' not found".format(request_path))
        return candidate

    def handle_error(self, msg):
        """Send the error page for ``msg`` with status 404."""
        from html import escape

        # Path and message come from the client or from exceptions, so they
        # are escaped before being placed into HTML.
        content = self.Error_Page.format(path=escape(self.path),
                                         msg=escape(str(msg)))
        self.send_content(content.encode("utf-8"), 404)

    def send_content(self, content, status=200):
        """Write a complete response.

        Args:
            content: Response body as bytes (HTML for normal and error pages).
            status: HTTP status code, 200 by default.
        """
        self.send_response(status)
        self.send_header("Content-type", "text/html")
        self.send_header("Content-Length", str(len(content)))
        self.end_headers()
        self.wfile.write(content)




if __name__ == '__main__':
    # Empty host string and port 8080; serve requests until interrupted.
    server = HTTPServer(('', 8080), RequestHandler)
    server.serve_forever()

View Code

功能:通过http.server模块建立BaseHTTPRequestHandler的方法.利用subprocess模块实现cgi编程,输入time.py观看当前时间.

补:因为windows目录用\作分隔符,所以在linux上要做对应路径的修改.上面程序是在windows上跑的.

●用PIL.image,numpy通过欧式度量来对图片进行马赛克处理

# coding:utf-8
"""
使用 Python 创建照片马赛克

输入一张目标照片和多张替换照片，将目标照片按网格划分为许多小方块，然后将每个小方块替换为颜色值最
接近的那张替换照片，就形成了马赛克效果。
"""

import argparse
import os

import numpy as np
from PIL import Image


def splitImage(image, size):
    """
    Cut an image into a grid of equally sized tiles.

    Only the part that divides evenly is used; leftover pixels at the right
    and bottom edges are dropped.

    @param {Image} image PIL Image object
    @param {Tuple[int, int]} size number of grid rows and columns
    @return {List[Image]} tiles, row by row, left to right
    """
    total_width, total_height = image.size[0], image.size[1]
    rows, cols = size
    tile_w, tile_h = int(total_width / cols), int(total_height / rows)
    # The origin is the top-left corner; crop takes (left, upper, right, lower).
    return [
        image.crop((col * tile_w, row * tile_h,
                    (col + 1) * tile_w, (row + 1) * tile_h))
        for row in range(rows)
        for col in range(cols)
    ]


def getImages(imageDir):
    """
    Load every readable image found directly inside a directory.

    Sub-directories are skipped.  Files that cannot be opened or decoded as
    an image are reported on stdout and left out, so the result contains
    only fully loaded images.

    @param {str} imageDir directory path
    @return {List[Image]} loaded images, in sorted file-name order
    """
    images = []
    for file in sorted(os.listdir(imageDir)):
        # abspath normalises the joined path.
        filePath = os.path.abspath(os.path.join(imageDir, file))
        if os.path.isdir(filePath):
            continue
        try:
            # `with` closes the file even when decoding fails.
            with open(filePath, "rb") as fp:
                im = Image.open(fp)
                # Image.open only reads the header; load() pulls the pixel
                # data into memory so the file can be closed afterwards.
                im.load()
        except Exception:
            # Best effort: a few unreadable pictures do not matter.
            print("Invalid image: %s" % (filePath,))
        else:
            # Appended only after a successful load.
            images.append(im)
    return images


def getAverageRGB(image):
    """
    Compute the average R, G and B values of an image.

    Each distinct colour is weighted by the number of pixels that have it;
    the per-channel sums are divided by the pixel count and truncated to int.

    @param {Image} image PIL Image object
    @return {Tuple[int, int, int]} average RGB value
    """
    pixel_count = image.size[0] * image.size[1]
    # getcolors yields [(count, (r, g, b)), ...]; asking for up to
    # pixel_count colours covers every possible distinct colour.
    histogram = image.getcolors(pixel_count)
    weighted = [
        (count * color[0], count * color[1], count * color[2])
        for count, color in histogram
    ]
    # zip(*weighted) regroups the triples into one sequence per channel.
    return tuple(int(sum(channel) / pixel_count) for channel in zip(*weighted))


def getAverageRGBNumpy(image):
    """
    Compute the per-channel average of an image with numpy.

    This is a faster replacement for getAverageRGB.  The values are not
    truncated, so the result holds floats.

    @param {Image} image PIL Image object (anything np.array turns into a
        3-dimensional array)
    @return {Tuple[float, ...]} one average per channel
    """
    pixels = np.array(image)
    # The array must be 3-dimensional; the last axis holds the channels.
    dim0, dim1, depth = pixels.shape
    # Flatten the first two axes so each row is one pixel, then average
    # down the rows.
    flat = pixels.reshape(dim0 * dim1, depth)
    return tuple(np.average(flat, axis=0))


def getBestMatchIndex(input_avg, avgs):
    """
    Find the index of the colour closest to a target colour.

    Colours are treated as points in 3-D space.  Squared distances are
    compared, since the square root is not needed to rank them.  On ties the
    earliest entry wins; an empty list gives 0.

    @param {Tuple[int, int, int]} input_avg target colour
    @param {List[Tuple[int, int, int]]} avgs colours to search
    @return {int} index of the closest entry
    """
    best_index = 0
    best_dist = float("inf")
    for position, candidate in enumerate(avgs):
        d_red = candidate[0] - input_avg[0]
        d_green = candidate[1] - input_avg[1]
        d_blue = candidate[2] - input_avg[2]
        dist = d_red * d_red + d_green * d_green + d_blue * d_blue
        if dist < best_dist:
            best_dist, best_index = dist, position
    return best_index


def createImageGrid(images, dims):
    """
    Paste tiles row by row onto one new large image.

    Each grid cell is as large as the widest and tallest tile.

    @param {List[Image]} images tiles in row-major order
    @param {Tuple[int, int]} dims number of rows and columns of the grid
    @return Image the assembled image
    """
    rows, cols = dims

    # The number of tiles must fill the grid exactly.
    assert rows * cols == len(images)

    cell_width = max(tile.size[0] for tile in images)
    cell_height = max(tile.size[1] for tile in images)

    grid_img = Image.new('RGB', (cols * cell_width, rows * cell_height))

    for position, tile in enumerate(images):
        # Row-major order: quotient is the row, remainder is the column.
        row, col = divmod(position, cols)
        grid_img.paste(tile, (col * cell_width, row * cell_height))

    return grid_img


def createPhotomosaic(target_image, input_images, grid_size,
                      reuse_images=True):
    """
    Build a photomosaic of the target image.

    The target is cut into grid tiles, and every tile is replaced by the
    input image whose average colour is closest to the tile's.

    @param {Image} target_image image to reproduce
    @param {List[Image]} input_images replacement images (not modified)
    @param {Tuple[int, int]} grid_size number of grid rows and columns
    @param {bool} reuse_images whether a replacement image may be used for
        more than one tile
    @return {Image} the mosaic
    @raise ValueError if there are no input images, or fewer input images
        than tiles while reuse_images is False
    """
    print('splitting input image...')
    target_images = splitImage(target_image, grid_size)

    if not input_images:
        raise ValueError('input_images must not be empty')
    if not reuse_images and len(input_images) < len(target_images):
        raise ValueError(
            'need at least %d input images when reuse_images is False, '
            'got %d' % (len(target_images), len(input_images)))

    print('finding image matches...')
    # Work on copies so that removing used images does not change the
    # caller's list.
    candidates = list(input_images)
    # The numpy version is noticeably faster for the replacement images.
    avgs = [getAverageRGBNumpy(img) for img in candidates]

    output_images = []
    # Progress is reported roughly ten times.
    batch_size = int(len(target_images) / 10)

    for count, img in enumerate(target_images):
        avg = getAverageRGB(img)
        match_index = getBestMatchIndex(avg, avgs)
        output_images.append(candidates[match_index])
        if count > 0 and batch_size > 10 and count % batch_size == 0:
            print('processed %d of %d...' % (count, len(target_images)))
        if not reuse_images:
            # Drop the image and its average together so the indices of the
            # two lists keep matching.
            del candidates[match_index]
            del avgs[match_index]

    print('creating mosaic...')
    mosaic_image = createImageGrid(output_images, grid_size)

    return mosaic_image


def main():
    """Create a photomosaic using the settings in the local ``args`` class.

    The settings are hard-coded below instead of being read from the command
    line; edit the ``args`` attributes to change them.  All paths are
    relative to the current working directory.
    """
    home = os.getcwd()

    class args():
        grid_size = 128, 128   # cut the target into 128 x 128 tiles
        # NOTE(review): the three path literals were corrupted in this copy
        # of the file; the names below are reconstructed - confirm them.
        target_image = os.path.join(home, 'test-data', 'a.jpg')
        input_folder = os.path.join(home, 'test-data', 'set1')  # tile pictures
        outfile = os.path.join(home, 'mosaiced.png')            # result file

    # Grid size as (rows, columns).
    grid_size = (int(args.grid_size[0]), int(args.grid_size[1]))
    print(grid_size)

    # Output path; mosaic.png when no outfile is configured.
    output_filename = 'mosaic.png'
    if args.outfile:
        output_filename = args.outfile

    print('reading targe image...')
    print(args.target_image)
    target_image = Image.open(args.target_image)

    print('reading input images...')
    input_images = getImages(args.input_folder)
    if input_images == []:
        print('No input images found in %s. Exiting.' % (args.input_folder, ))
        # Same effect as exit(), without relying on the site module.
        raise SystemExit

    # Shrink every replacement image to the size of one grid cell.
    print('resizing images...')
    # dims is the (width, height) of one cell in pixels; the integer
    # division drops any remainder.
    dims = (int(target_image.size[0] / grid_size[1]),
            int(target_image.size[1] / grid_size[0]))
    print(target_image.size)
    print(dims)
    for img in input_images:
        img.thumbnail(dims)  # shrinks in place

    print('starting photomosaic creation...')
    mosaic_image = createPhotomosaic(target_image, input_images, grid_size)

    mosaic_image.save(output_filename, 'PNG')
    print("saved output to %s" % (output_filename,))

    print('done.')


if __name__ == '__main__':
    main()

View Code

细节:image.crop image.paste来对文件进行裁剪粘贴成小块.每一个小块用numpy转化成矩阵得到第三个维度的rgb值.

对每一个小块找库中纯色小块跟当前小块3维欧式度量最近的小块来替换当前小块即可.

●用PIL模块把图片转化成字符画

from PIL import Image
import argparse

##赋予参数的方法1:用argparse来设置命令行的参数用cmd python 文件.py +参数来运行
##命令行输入参数处理
#parser = argparse.ArgumentParser()

#parser.add_argument('file')     #输入文件
#parser.add_argument('-o', '--output')   #输出文件
#parser.add_argument('--width', type = int, default = 80) #输出字符画宽
#parser.add_argument('--height', type = int, default = 80) #输出字符画高

##获取参数
#args = parser.parse_args()

#我用第二个方法:直接在py里面改
class args():
    """Settings for the ASCII-art conversion (edit here instead of using argparse)."""
    # NOTE(review): both path literals were corrupted in this copy of the
    # file (escape sequences were expanded); confirm the reconstructed paths.
    file = r'D:\桌面使用大全\repos\test-data.jpeg'
    width = 50
    height = 50
    output = r'D:\桌面使用大全\repos\test-data\char_b.txt'

IMG = args.file
WIDTH = args.width
HEIGHT = args.height
OUTPUT = args.output

# Grey-scale ramp of 70 characters.  '$' stands for the lowest grey value
# (black) and the characters towards the end for the highest ones (white).
# Any characters can be used as long as the order goes from dark to light.
ascii_char = list("$@B%8&WM#*oahkbdpqwmZO0QLCJUYXzcvunxrjft/\\|()1{}[]?-_+~<>i!lI;:,\"^`'. ")
#解释一下这个accii_char的构造:他其实是一个灰度值表.
#可以用任意东西来替换里面的内容.$就是代表灰度值最小的也就是亮度值最小的即黑色,
#后面I1I''这些字符就是亮度值很大的接近白色的色块.这个程序本质就是把原图像素用这些字符替换即可.


# 将256灰度映射到70个字符上
def get_char(r,g,b,alpha = 256):
    """Map one pixel to a character of the ``ascii_char`` ramp.

    A pixel whose alpha is 0 becomes a space.  Otherwise the weighted grey
    value of (r, g, b) is scaled onto the length of the ramp.
    """
    if alpha == 0:
        return ' '
    ramp_size = len(ascii_char)
    # Weighted sum of the three channels, truncated to an int.
    gray = int(0.2126 * r + 0.7152 * g + 0.0722 * b)
    # Width of the grey range covered by one ramp character.
    step = (256.0 + 1) / ramp_size
    position = int(gray / step)
    return ascii_char[position]

if __name__ == '__main__':

    im = Image.open(IMG)
    # RGBA guarantees that getpixel returns an (r, g, b, a) tuple even for
    # grey-scale or palette images.
    im = im.convert('RGBA')
    im = im.resize((WIDTH,HEIGHT), Image.NEAREST)

    lines = []
    for i in range(HEIGHT):
        # im.getpixel((j, i)) is the colour tuple at column j, row i; the *
        # unpacks it into get_char's arguments.
        lines.append(''.join(get_char(*im.getpixel((j,i)))
                             for j in range(WIDTH)))
    # Every row ends with a newline.
    txt = ''.join(line + '\n' for line in lines)

    print(txt)

    # Write the picture to OUTPUT, or to output.txt when OUTPUT is not set.
    # View the file with a monospaced font so the rows line up.
    with open(OUTPUT or "output.txt",'w') as f:
        f.write(txt)

View Code

●pyQt5实现简单浏览器:前进后退刷新,打开多个网页标签功能.

from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from PyQt5.QtGui import *
from PyQt5.QtWebKitWidgets import *

import sys

class MainWindow(QMainWindow):
    """Tabbed browser window with back / forward / stop / reload and a URL bar."""

    def __init__(self, *args, **kwargs):
        """Build the URL bar, the tab widget with a home page, and the toolbar."""
        super().__init__(*args, **kwargs)
        self.resize(1200,800)
        self.setWindowTitle('My Browser')
        self.setWindowIcon(QIcon('icons/penguin.png'))
        self.show()

        # URL bar; pressing Enter navigates the current tab.
        self.urlbar = QLineEdit()
        self.urlbar.returnPressed.connect(self.navigate_to_url)

        # Tab widget.
        self.tabs = QTabWidget()
        self.tabs.setDocumentMode(True)
        self.tabs.tabBarDoubleClicked.connect(self.tab_open_doubleclick)
        self.tabs.currentChanged.connect(self.current_tab_changed)
        self.tabs.setTabsClosable(True)
        self.tabs.tabCloseRequested.connect(self.close_current_tab)

        self.add_new_tab(QUrl('http://shiyanlou.com'), 'Homepage')

        self.setCentralWidget(self.tabs)

        # Navigation toolbar.
        navigation_bar = QToolBar('Navigation')
        navigation_bar.setIconSize(QSize(16, 16))
        self.addToolBar(navigation_bar)

        back_button = QAction(QIcon('icons/back.png'), 'Back', self)
        next_button = QAction(QIcon('icons/next.png'), 'Forward', self)
        stop_button = QAction(QIcon('icons/cross.png'), 'stop', self)
        reload_button = QAction(QIcon('icons/renew.png'), 'reload', self)
        new_tab_action = QAction(QIcon('icons/add_page.png'), 'New Page', self)

        # The actions look up the current tab when triggered.  Connecting to
        # self.tabs.currentWidget().back directly would bind them to the tab
        # that exists right now, so later tabs could not be controlled.
        back_button.triggered.connect(self._on_current_tab('back'))
        next_button.triggered.connect(self._on_current_tab('forward'))
        stop_button.triggered.connect(self._on_current_tab('stop'))
        reload_button.triggered.connect(self._on_current_tab('reload'))
        new_tab_action.triggered.connect(self.tab_open_doubleclick)

        navigation_bar.addAction(back_button)
        navigation_bar.addAction(next_button)
        navigation_bar.addAction(stop_button)
        navigation_bar.addAction(reload_button)
        navigation_bar.addAction(new_tab_action)

        navigation_bar.addSeparator()
        navigation_bar.addWidget(self.urlbar)

    def _on_current_tab(self, method_name):
        """Return a slot that calls ``method_name`` on the tab shown when it fires."""
        def slot(*_signal_args):
            browser = self.tabs.currentWidget()
            if browser is not None:
                getattr(browser, method_name)()
        return slot

    def navigate_to_url(self):
        """Load the URL typed into the URL bar in the current tab."""
        q = QUrl(self.urlbar.text())

        # Default to http when the user typed no scheme.
        if q.scheme() == '':
            q.setScheme('http')
        self.tabs.currentWidget().setUrl(q)

    def renew_urlbar(self, q, browser=None):
        """Show URL ``q`` in the URL bar if ``browser`` is the current tab."""
        if browser != self.tabs.currentWidget():
            return
        self.urlbar.setText(q.toString())
        self.urlbar.setCursorPosition(0)

    def add_new_tab(self, qurl=QUrl(''), label='Blank'):
        """Open ``qurl`` in a new tab labelled ``label`` and switch to it."""
        browser = QWebView()
        browser.setUrl(qurl)
        i = self.tabs.addTab(browser, label)

        self.tabs.setCurrentIndex(i)

        browser.urlChanged.connect(lambda qurl, browser=browser: self.renew_urlbar(qurl, browser))

        # The index is looked up when the page has loaded: the index returned
        # by addTab goes stale as soon as an earlier tab is closed.
        browser.loadFinished.connect(lambda _, browser=browser:
            self.tabs.setTabText(self.tabs.indexOf(browser),
                                 browser.page().mainFrame().title()))

    def tab_open_doubleclick(self, i):
        """Open a blank tab (tab-bar double click or the New Page action)."""
        self.add_new_tab()

    def current_tab_changed(self, i):
        """Update the URL bar after switching tabs."""
        browser = self.tabs.currentWidget()
        if browser is None:
            return
        self.renew_urlbar(browser.url(), browser)

    def close_current_tab(self, i):
        """Close tab ``i`` unless it is the last remaining tab."""
        if self.tabs.count() < 2:
            return
        browser = self.tabs.widget(i)
        self.tabs.removeTab(i)
        # removeTab only detaches the page widget; schedule it for deletion
        # so the closed page is released.
        if browser is not None:
            browser.deleteLater()

        
# Create the application and the main window, then run the event loop.
app = QApplication(sys.argv)
window = MainWindow()
window.show()
# Hand the event loop's return code to the shell, as the editor example at
# the top of this file does.
sys.exit(app.exec_())

View Code

●用dnn对汽车销量这个比赛的代码总结:

dnn加batch normalization

"""
View more, visit my tutorial page: https://morvanzhou.github.io/tutorials/
My Youtube Channel: https://www.youtube.com/user/MorvanZhou
Dependencies:
torch: 0.1.11
matplotlib
"""
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import init
import torch.utils.data as Data
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

import torch
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt

# torch.manual_seed(1)    # reproducible

# Toy regression data: 100 points in [-1, 1] as a column, shape (100, 1).
x = torch.linspace(-1, 1, 100).unsqueeze(1)
# Target is x squared plus uniform noise scaled by 0.2, same shape as x.
y = x.pow(2) + 0.2 * torch.rand(x.size())

# Wrap both tensors in Variable for training.
x = Variable(x)
y = Variable(y)


# plt.scatter(x.data.numpy(), y.data.numpy())
# plt.show()


class Net(nn.Module):
    """Fully connected 1 -> 100 -> 10 -> 1 network with batch normalisation.

    The input is normalised first, and each hidden Linear layer is followed
    by BatchNorm1d and ReLU.
    """

    def __init__(self):
        super(Net, self).__init__()
        layers = [
            # Normalise the raw input.
            nn.BatchNorm1d(1, momentum=0.5),
            nn.Linear(1, 100),
            # BatchNorm1d takes the output width of the preceding layer.
            nn.BatchNorm1d(100, momentum=0.5),
            nn.ReLU(),
            nn.Linear(100, 10),
            nn.BatchNorm1d(10, momentum=0.5),
            nn.ReLU(),
            nn.Linear(10, 1),
        ]
        self.dnn = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        return self.dnn(x)

net = Net()     # define the network
print(net)  # net architecture

optimizer = torch.optim.SGD(net.parameters(), lr=0.5)
loss_func = torch.nn.MSELoss()  # this is for regression mean squared loss

plt.ion()   # interactive mode, so the plot can be redrawn while training

for t in range(100):
    prediction = net(x)     # input x and predict based on x

    loss = loss_func(prediction, y)     # must be (1. nn output, 2. target)

    optimizer.zero_grad()   # clear gradients for next train
    loss.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients

    if t % 5 == 0:
        # plot and show learning process
        plt.cla()
        plt.scatter(x.data.numpy(), y.data.numpy())
        plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
        # The loss is a 0-dim tensor on current PyTorch, where indexing it
        # with [0] fails; item() reads the scalar.  The old form is kept as a
        # fallback for releases without item().
        loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
        plt.text(0.5, 0, 'Loss=%.4f' % loss_value, fontdict={'size': 20, 'color':  'red'})
        plt.pause(0.1)

plt.ioff()
plt.show()



#下面我写一下输出的效果.代码几乎都相同只差一个bn添加没添加都只训练100步,看loss大小
#第一次:不加bn:0.0628  加:0.0038
## 第二次:      0.0568   加:0.0038

View Code

dnn手动加batch normalization

"""
View more, visit my tutorial page: https://morvanzhou.github.io/tutorials/
My Youtube Channel: https://www.youtube.com/user/MorvanZhou
Dependencies:
torch: 0.1.11
matplotlib
"""
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import init
import torch.utils.data as Data
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

import torch
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt

# torch.manual_seed(1)    # reproducible

# Toy regression data: 100 points in [-1, 1] as a column, shape (100, 1).
x = torch.linspace(-1, 1, 100).unsqueeze(1)
# Target is x squared plus uniform noise scaled by 0.2, same shape as x.
y = x.pow(2) + 0.2 * torch.rand(x.size())

# Wrap both tensors in Variable for training.
x = Variable(x)
y = Variable(y)


# plt.scatter(x.data.numpy(), y.data.numpy())
# plt.show()


class Net(nn.Module):
    """1 -> 100 -> 10 -> 1 network with hand-written batch normalisation.

    Instead of ``nn.BatchNorm1d`` each normalisation step standardises the
    batch by hand and is followed by a trainable Linear layer of the same
    width (``ui`` and ``ui2``).  All trainable layers are created in
    ``__init__`` so that ``net.parameters()`` hands them to the optimizer.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.diyi = nn.Linear(1, 100)    # first layer
        self.dier = nn.Linear(100, 10)   # second layer
        self.diyi1 = nn.ReLU()
        self.disan = nn.Linear(10, 1)    # output layer

        # Linear layers applied right after each manual normalisation.
        self.ui = nn.Linear(100, 100)
        self.ui2 = nn.Linear(10, 10)

    @staticmethod
    def _standardize(x, epsilon=1e-5):
        """Standardise ``x`` along the batch dimension (dim 0).

        ``epsilon`` keeps the division finite when a feature is constant
        across the batch; with 0.0 such a feature turns into NaN.  The batch
        needs at least two samples, because ``torch.std`` of a single sample
        is NaN.
        """
        m = torch.mean(x, dim=0)
        std = torch.std(x, dim=0)
        return (x - m) / (std + epsilon)

    def forward(self, x):
        """Run the network on a batch ``x`` and return the prediction."""
        x = self._standardize(x)
        x = self.diyi(x)

        x = self._standardize(x)
        x = self.ui(x)
        x = self.diyi1(x)

        x = self.dier(x)

        x = self._standardize(x)
        x = self.ui2(x)
        x = self.diyi1(x)
        x = self.disan(x)

        return x

net = Net()     # define the network
print(net)  # net architecture

optimizer = torch.optim.SGD(net.parameters(), lr=0.05)
loss_func = torch.nn.MSELoss()  # this is for regression mean squared loss

plt.ion()   # interactive mode, so the plot can be redrawn while training

for t in range(100):
    prediction = net(x)     # input x and predict based on x

    loss = loss_func(prediction, y)     # must be (1. nn output, 2. target)

    optimizer.zero_grad()   # clear gradients for next train
    loss.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients

    if t % 5 == 0:
        # plot and show learning process
        plt.cla()
        plt.scatter(x.data.numpy(), y.data.numpy())
        plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
        # The loss is a 0-dim tensor on current PyTorch, where indexing it
        # with [0] fails; item() reads the scalar.  The old form is kept as a
        # fallback for releases without item().
        loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
        plt.text(0.5, 0, 'Loss=%.4f' % loss_value, fontdict={'size': 20, 'color':  'red'})
        plt.pause(0.1)

plt.ioff()
plt.show()




#下面我写一下输出的效果.代码几乎都相同只差一个bn添加没添加



#手动加没写明白,跑是跑了,效果这么差?
#后来经过我的改动,写出来了,原因就是SGD的理解不够,他里面第一个参数是
## net.parameters(),他表示他只训练网络里面的系数,这个系数指的是__init__里面的系数
## 不是forward里面的系数,所以forward里面的网络除了归一化以外的网络都需要在__init__
## 里面先生成这个网络模型,然后他的参数才能给SGD,然后才能进行学习,
#效果差不多,手动写的效果更好一点,可能是因为我的学习率改更小了,说明学习率还是很重要的
#即使你用了bn层也一定要调试好这个超参数

View Code

●实验楼的用python写2048

状态机的使用和curses模块来gui画图形和读取键盘输入.每一个状态转移后就画一次图形.

curses:在python2.7环境下通过wheel来安装.用anaconda可以自动识别新装入的2.7版本,然后打开2.7版本的终端来运行脚本即可.其实Visual Studio也能自动识别新安装的Python环境,可以在VS中切换.

#-*- coding:utf-8 -*-

import curses
from random import randrange, choice # generate and place new tile
from collections import defaultdict

# Key codes accepted by the game: the six upper-case keys first, then the same keys in lower case.
letter_codes = list(map(ord, 'WASDRQwasdrq'))
# Action names, in the same order as the keys W, A, S, D, R, Q.
actions = ['Up', 'Left', 'Down', 'Right', 'Restart', 'Exit']
# Key code -> action name; the action list is cycled once so both letter cases are covered.
actions_dict = {
    code: actions[position % len(actions)]
    for position, code in enumerate(letter_codes)
}

def get_user_action(keyboard):
    """Block until a mapped key is read from `keyboard` and return its action name.

    `keyboard` must provide ``getch()``; key codes that are not keys of
    ``actions_dict`` are read and ignored.
    """
    while True:
        key_code = keyboard.getch()
        if key_code in actions_dict:
            return actions_dict[key_code]

def transpose(field):
    """Return the transpose of `field` (a list of rows) as a new list of lists."""
    columns = []
    for column in zip(*field):
        columns.append(list(column))
    return columns

def invert(field):
    """Return a new list in which every row of `field` is reversed left-to-right."""
    mirrored = []
    for row in field:
        mirrored.append(row[::-1])
    return mirrored

class GameField(object):
    """Board state and rules of a 2048-style game.

    The board is ``self.field``, a list of ``height`` rows with ``width``
    integers each; 0 marks an empty cell.  ``score`` is the sum of all merged
    tiles since the last reset and ``highscore`` the best score seen so far.
    """

    def __init__(self, height=4, width=4, win=2048):
        """Create a board of `height` x `width`; a tile >= `win` wins the game."""
        self.height = height
        self.width = width
        self.win_value = win
        self.score = 0
        self.highscore = 0
        self.reset()

    def reset(self):
        """Record the high score, clear the board and place two starting tiles."""
        if self.score > self.highscore:
            self.highscore = self.score
        self.score = 0
        self.field = [[0 for i in range(self.width)] for j in range(self.height)]
        self.spawn()
        self.spawn()

    def move(self, direction):
        """Slide/merge the board towards `direction` and spawn a new tile.

        Returns True when the board changed, False when the move is not
        possible or `direction` is not one of Left/Right/Up/Down.
        """
        def move_row_left(row):
            def tighten(row):  # squeeze non-zero elements together on the left
                new_row = [i for i in row if i != 0]
                new_row += [0] * (len(row) - len(new_row))
                return new_row

            def merge(row):
                # Merge equal neighbours pairwise from the left; the first cell of
                # a pair becomes 0 and the second one holds the doubled value.
                pair = False
                new_row = []
                for i in range(len(row)):
                    if pair:
                        new_row.append(2 * row[i])
                        self.score += 2 * row[i]
                        pair = False
                    else:
                        if i + 1 < len(row) and row[i] == row[i + 1]:
                            pair = True
                            new_row.append(0)
                        else:
                            new_row.append(row[i])
                assert len(new_row) == len(row)
                return new_row
            return tighten(merge(tighten(row)))

        # Every direction is expressed through the left move by mirroring
        # and/or transposing the board before and after.
        moves = {}
        moves['Left'] = lambda field: [move_row_left(row) for row in field]
        moves['Right'] = lambda field: invert(moves['Left'](invert(field)))
        moves['Up'] = lambda field: transpose(moves['Left'](transpose(field)))
        moves['Down'] = lambda field: transpose(moves['Right'](transpose(field)))

        if direction in moves and self.move_is_possible(direction):
            self.field = moves[direction](self.field)
            self.spawn()
            return True
        return False

    def is_win(self):
        """Return True when any tile has reached ``win_value``."""
        return any(any(i >= self.win_value for i in row) for row in self.field)

    def is_gameover(self):
        """Return True when no entry of the module-level ``actions`` is a possible move."""
        return not any(self.move_is_possible(move) for move in actions)

    def draw(self, screen):
        """Render score, board and status/help lines through ``screen.addstr``."""
        help_string1 = '(W)Up (S)Down (A)Left (D)Right'
        help_string2 = '     (R)Restart (Q)Exit'
        gameover_string = '           GAME OVER'
        win_string = '          YOU WIN!'

        def cast(string):
            screen.addstr(string + '\n')

        def draw_hor_separator():
            # Every separator line is identical: '+------' per column plus a closing '+'.
            cast('+' + ('+------' * self.width + '+')[1:])

        def draw_row(row):
            cast(''.join('|{: ^5} '.format(num) if num > 0 else '|      ' for num in row) + '|')

        screen.clear()
        cast('SCORE: ' + str(self.score))
        if 0 != self.highscore:
            cast('HIGHSCORE: ' + str(self.highscore))
        for row in self.field:
            draw_hor_separator()
            draw_row(row)
        draw_hor_separator()
        if self.is_win():
            cast(win_string)
        else:
            if self.is_gameover():
                cast(gameover_string)
            else:
                cast(help_string1)
        cast(help_string2)

    def spawn(self):
        """Put a new tile (4 with 10% probability, otherwise 2) on a random empty cell."""
        new_element = 4 if randrange(100) > 89 else 2
        # Rows are indexed by height and columns by width; swapping the two
        # ranges only works by accident on square boards.
        (i, j) = choice([(i, j)
                         for i in range(self.height)
                         for j in range(self.width)
                         if self.field[i][j] == 0])
        self.field[i][j] = new_element

    def move_is_possible(self, direction):
        """Return True when moving towards `direction` would change the board."""
        def row_is_left_movable(row):
            def change(i):  # true if there'll be change in i-th tile
                if row[i] == 0 and row[i + 1] != 0:  # Move
                    return True
                if row[i] != 0 and row[i + 1] == row[i]:  # Merge
                    return True
                return False
            return any(change(i) for i in range(len(row) - 1))

        # Same mirroring/transposing trick as in move(): reduce everything to 'Left'.
        check = {}
        check['Left'] = lambda field: any(row_is_left_movable(row) for row in field)
        check['Right'] = lambda field: check['Left'](invert(field))
        check['Up'] = lambda field: check['Left'](transpose(field))
        check['Down'] = lambda field: check['Right'](transpose(field))

        if direction in check:
            return check[direction](self.field)
        else:
            return False

def main(stdscr):
    """Run the 2048 state machine on the curses window `stdscr` until the 'Exit' state."""

    def init():
        # Reset the board, then hand over to the playing state.
        game_field.reset()
        return 'Game'

    def not_game(state):
        # Draw the final board; draw() adds the win / game-over banner itself.
        game_field.draw(stdscr)
        # Only Restart and Exit leave this screen; any other action keeps `state`.
        action = get_user_action(stdscr)
        if action == 'Restart':
            return 'Init'
        if action == 'Exit':
            return 'Exit'
        return state

    def game():
        # Draw the current board and wait for the next action.
        game_field.draw(stdscr)
        action = get_user_action(stdscr)

        if action in ('Restart', 'Exit'):
            return 'Init' if action == 'Restart' else 'Exit'
        moved = game_field.move(action)
        if moved and game_field.is_win():
            return 'Win'
        if moved and game_field.is_gameover():
            return 'Gameover'
        return 'Game'

    def win():
        return not_game('Win')

    def gameover():
        return not_game('Gameover')

    # State name -> handler; each handler returns the name of the next state.
    state_actions = {
        'Init': init,
        'Win': win,
        'Gameover': gameover,
        'Game': game,
    }

    curses.use_default_colors()

    # The winning tile value is set to 32 for this run.
    game_field = GameField(win=32)

    # Drive the state machine until a handler returns 'Exit'.
    state = 'Init'
    while state != 'Exit':
        handler = state_actions[state]
        state = handler()

# Script entry point: curses.wrapper sets up the terminal, calls main(stdscr)
# and restores the terminal when main returns or raises.
curses.wrapper(main)

View Code

●可能会用到的trie树,dat树,倒排索引

#coding=utf-8  #用嵌套字典实现的Trie(别人写的).每个节点是一个字典,英文小写单词的话每层最多26个分支,所以查找一个单词只需要走"单词长度"那么多步,
#不管单词表有多大,查找时间都只是O(单词长度).
'''
Python 字典 setdefault() 函数和get() 方法类似, 如果键不存在于字典中，将会添加键并将值设为默认值。
说清楚就是:如果这个键存在字典中,那么这句话就不起作用,否则就添加字典里面这个key的取值为后面的默认值.
简化了字典计数的代码.并且这个函数的返回值是做完这些事情之后这个key的value值.
dict.setdefault(key, default=None)
Python 字典 get() 函数返回指定键的值，如果值不在字典中返回默认值。
dict.get(key, default=None)
'''
class Trie:
    """Prefix tree stored as nested dicts, one dict per node keyed by character.

    A node that contains the key ``END`` marks the end of a stored word; a
    node without it is only a prefix.  The value stored under ``END`` is
    ``None`` here, but it could carry per-word data.
    """

    END = '/'  # end-of-word marker key; distinguishes real words from mere prefixes

    def __init__(self):
        # The root must be per instance: a class-level dict would be shared
        # by every Trie object.
        self.root = {}

    def add(self, word):
        """Insert `word`; adding the same word twice leaves the tree unchanged."""
        node = self.root
        for c in word:
            # setdefault returns the child node, creating it when missing.
            node = node.setdefault(c, {})
        node[self.END] = None

    def find(self, word):
        """Return True when `word` was added as a complete word."""
        node = self.root
        for c in word:
            if c not in node:
                return False
            node = node[c]
        return self.END in node

    def associate_find(self, pre):
        """Return every stored word that starts with `pre` ([] when none does)."""
        node = self.root
        for c in pre:
            if c not in node:
                return []
            node = node[c]

        def collect(current, prefix):
            # Depth-first walk in dict order; an END key means `prefix` itself is a word.
            words = []
            for key, child in current.items():
                if key == self.END:
                    words.append(prefix)
                else:
                    words.extend(collect(child, prefix + key))
            return words

        return collect(node, pre)

    def delete(self, word):
        """Remove `word` and prune nodes that no longer lead to any word.

        Returns True when the word was removed, False (after printing a
        notice) when it was not stored, including the case where it is only
        a prefix of other words.
        """
        node = self.root
        path = []  # (parent node, character) pairs from the root down to the word's node
        for c in word:
            if c not in node:
                print('字典中没有不用删')
                return False
            path.append((node, c))
            node = node[c]
        if self.END not in node:
            print('字典中没有不用删')
            return False
        del node[self.END]
        # Walk back up, dropping every node that became empty.
        while path and not node:
            parent, c = path.pop()
            del parent[c]
            node = parent
        return True


# Small demo: add two words that share a prefix, delete the longer one.
a=Trie()

(Trie.END)# no-op expression; class attributes are also readable through instances
a.add('apple')
a.add('appl')
a.delete('apple')

print(a.find('apple'))
print(a.root)# only the branch used solely by 'apple' (the 'e' node) is pruned; 'appl' stays in the tree
#下面我打算加一个功能就是词汇联想功能,输入a,输出a,ab,abc.就是把a后面的字典里面的所有的单词就输出出来.


#两个字典的key相同,id就相同.真坑.用id区分不了2个取值相同的不同元素.
#my={'a':{}}
#print(type(my))
#my['a']={'a':{'/'}}
#for i in my:
#   print(id(i))
#   a=my[i]
#   for j in a:
#       print(id(j))

View Code

'''

ps:#a=[i*i for i in range(5) if i<3 ]  #python for if 的一行写法.
https://segmentfault.com/a/1190000008877595#articleHeader7
5.4.2 Base Array 的构造
看这里面写的还真不难,之前一直没看懂,是因为他数据没有显示写入.
其实还有一个数组用来写入数据.比如
这里面第一步之后的data数组变成了
data[2]='清'
data[3]='华'
data[7]='中'
这样通过他的步骤,做到最后就是3个数组,data,base,check3个数组来
表示这个2array trie.就能方便找到每一个词组了.
但是写起来简直吐血.

首先看最终得到的结果如何使用它来找到所有的词组:

字典:''清华”、“清华大学”、“清新”、“中华”、“华人”
编码:清-1，华-2，大-3，学-4，新-5，中-6，人-7

数组下表:0    1   2   3   4   5   6   7   8   9   10
base数组:1    空  3   2   2   3   6   2   3   2    6

1.使用:找清华:首先从base[0]出发.清所在的位置是base[0]+code(清)=1+1=下标为2的地方
           清的base值不是负数,说明还可以继续向后拓展.所以可以继续找下一个字华.
           华的位置=上一个节点的base值+code(华)=base[2]+2=3+2=5.所以就找到了清华在我们字典里面存在
       找清华大学:上面华找到了(下标5,注意这是清华的华,它的base值是3),继续找大=base[5]+code(大)=3+3=6
                  继续找学=base[6]+code(学)=6+4=10.所以清华大学找到了.
  继续细化:叶子节点的处理:将词的最后一个节点的转移基数统一改为某个负数
           所以 数组下表:0    1   2   3   4   5    6    7    8    9   10
                base数组:1    空  3   2   -2   -3   6   2   -3   -2    -6
          这样做的代价就是需要将状态转移函数base[s]+code(字符)改为|base[s]|+code(字符)
          重新跑一次清华:上来还是清=1+1=2   华=3+2=5  然后看base[5]=-3 ,所以可以到此结束来组成一个词汇.
          但是我们还可以继续跑
          来找清华大学:从华找大:大=|-3|+code(大)=6,base[6]不是负数,不能输出.
                      继续找学:学=6+4=10,他的base是-6.所以可以输出.
  加入check数组来解决bug:比如找'清中':找清我们到了下标2(它的base值是3),找中我们到了3+code(中)=9.base[9]=-2.所以我们输出'清中'是一个词汇.
                        这显然是错误的!所以我们要加入check数组来避免这种匹配.这种bug的原因是中这个字前面
                        不能是清这个字.用check数组来记录每个位置的前一个字符所在的index.
          所以 数组下表:0    1   2   3   4   5    6    7    8    9   10
               base数组:1    空  3   2   -2   -3   6   2   -3   -2    -6
               check  :-3   -1   0   0   7   2     5   0   2    3     6
               这样找清中:清是到了index2.判断check是不是清的上一个节点.是0(0表示根)没问题.
                         找中找到index9.然后需要判断check[9]是不是他过来的节点的index.发现一个是2,一个是3
                         所以不对.输出清中不存在.
2.搭建:
https://blog.csdn.net/kissmile/article/details/47417277
这个写的也是不错.但是他搭建的顺序有一点错误,按照层搭建,第五部分应该是搭建第一层的b后面的c节点.
逻辑基本就是这样,能讲清楚就不错了.基本达到智商110以上了.能代码实现感觉智商上150了.
因为比较复杂,还是先写伪代码.再实现.
                          

题目:建立字典:字典:''清华”、“清华大学”、“清新”、“中华”、“华人”
伪代码过程:
●a=[''清华”、“清华大学”、“清新”、“中华”、“华人”],b=sum([len(i) for i in a])
●对set(a)进行编码:清-1，华-2，大-3，学-4，新-5，中-6，人-7
●建立首字集合c:清,中,华
●为了数组足够长,建立base=[0]*b  check=[0]*b
●把c插入双数组,对base[0]赋予初值1.(其实赋予2也一样,貌似更好,因为初值1基本都会发生冲突,会降低建立速度)
 对新建立的base里面也放入1.
 把c插入后:
 数组下表:0    1   2   3   4   5   6   7   8   9   10
 base数组:1    0   1   1   0   0   0   1    0   0    0
 check  :0    0   0   0   0   0   0   0    0   0    0

●下面就是插入第二个字:华,新,华,人(第一个华,表示清后面的华,虽然他有2个但是前面都是清,所以只插入一个,这就是为什么
 Trie树节省空间的原因).
 下面插入清后面的字:有华和新(对于同一个字的后面的字要一起考虑,因为可能要修改这同一个的base数组)
 从2开始跑,华=base[2]+code(华)=3.冲突了因为3里面已经有了.
 所以base[2]+=1.这时再算华=4了.不冲突了.
 插入新又冲突了.所以清要继续加1.插入后的新元素base还是置1.(但是网上写的是置清现在的base值.我感觉没必要啊!!!!)
 也就是下图5,8我都置1,但是网上置的是3.(下面通过我的计算,我置1最后还是为了解决冲突而加到3了.
 难道置3能减少冲突的发生?问题是会不会空间浪费太多?)(利用树来看就是树的第n层的偏移量一定比第n-1层的至少一样或者多)
 (为什么?)(我认为是从概率上来讲,每一个字符边上的字符数量都一样,所以你上个字母需要偏移3个才能不冲突,
 你也至少需要偏移3个.减少代码运行时间.要知道处理冲突非常非常慢!!!!!)
 同时把check也更新了,也就是把清的index 2放进去.
 得到:
 
 数组下表:0    1   2   3   4   5   6   7   8   9   10
 base数组:1    0   3   1   0   1   0   1    1   0    0
 check  : 0    0   0   0   0   2   0   0    2   0    0
 
 
 (!!!!!!这里面就是遇到一个问题非常重要.搭建时候一定要多行一起搭建,也就是按照root的一层来搭建.把一层都弄好
 再弄下一层,原因就是我们最后需要得到的树是一个公共前缀只保存一次的树!也是问题的根本,不保持的话这个trie树
 完全没意义了,所以公共前缀保持同时处理,所以只能这样按照root的层来搭建才可以.)
 同理插入中后面的字:7的base+=1.得到:
 数组下表:0    1   2   3   4   5   6   7   8   9   10
 base数组:1    0   3   1   1   1   0   2    1   0    0
 check  : 0    0   0   0   7   2   0   0    2   0    0

 同理华人:得到:
 数组下表:0    1   2   3   4   5   6   7   8   9   10
 base数组:1    0   3   2   1   1   0   2    1   1    0
 check  : 0    0   0   0   7   2   0   0    2   3    0


 第三层.
 得到:
 数组下表:0    1   2   3   4   5   6   7   8   9   10
 base数组:1    0   3   2   1   3   1   2    1   1    0
 check  : 0    0   0   0   7   2   5   0    2   3    0

  第四层.
 得到:
 数组下表:0    1   2   3   4   5   6   7   8   9   10
 base数组:1    0   3   2   1   3   6   2    1   1    1
 check  : 0    0   0   0   7   2   5   0    2   3    6



 总结:难度不比红黑树简单.
'''
class DAT():
    def __init__(self,data):
        """Build the double-array trie for the word list `data`.

        Stores three attributes: ``self.bianma`` (character -> integer code,
        with '' -> 0 for the root), ``self.base`` and ``self.check`` (lists of
        the same length; unused slots hold the string '#').  A negative base
        value marks a slot at which a word may end.  Progress is printed
        while building.
        """
        # Pre-processing: flatten all characters and find the longest word.
        firststep=[]
        max_ceng=0# number of layers, i.e. length of the longest word
        for i in data:
            a=0
            for j in i:
                firststep.append(j)
                a+=1
            if a>max_ceng:
                max_ceng=a
        all_len=len(firststep)
        mono_len=len(set(firststep))

        # Encode characters with a dict (lookup in a list would be O(N)).
        bianma={}
        ma=1
        tmp=[]
        for i in firststep:# de-duplicate while keeping first-seen order, so codes are reproducible
            if i not in tmp:
                tmp.append(i)
        for i in tmp:
            if i not in bianma:
               bianma[i]=ma
               ma+=1
        # '' stands for the root and gets code 0; the root's base starts at 1.
        bianma['']=0
        # base and check are separate lists on purpose (no aliasing); '#' = free slot.
        base=['#']*all_len
        base[0]=1
        check=['#']*all_len
        # Print the encoding so the arrays can be verified by hand.
        print(bianma)
        self.bianma=bianma
        # Construction goes column by column (layer by layer): first characters
        # of all words, then second characters, and so on.
        # The first column is treated as "the characters that follow the empty prefix ''".


        before=''
        col_now=[i[len(before)] for i in data if before in i]# character following the prefix `before` in every word
        tmp=[]
        for i in col_now:
            if i not in tmp:
                tmp.append(i)
        col_now=tmp
        print('第一列')
        print(col_now)
        # Place the first-layer characters relative to the root slot.
        before_index=bianma[before]# root slot; deeper layers use the parent's slot instead
        now_layer_save_for_data=[]# characters placed in this layer, kept for the next layer
        now_layer_save_for_base=[]# slot index of each placed character, same order
        for i in col_now:

            while 1:
             index=base[before_index]+bianma[i]
             if base[index]=='#':# slot is free
                 base[index]=base[before_index]
                 check[index]=before_index
                 now_layer_save_for_data.append(i)
                 now_layer_save_for_base.append(index)
                 break
             else:
                 # slot taken: bump the parent's base and retry
                 base[before_index]+=1
        last_layer=1
        print('第一层')
        print(base)
        print(check)
        print(max_ceng)
        print(now_layer_save_for_data)
        print(now_layer_save_for_base)
        # Iterative (not recursive) construction of the remaining layers.
        # layer1 maps (character, slot index) -> indices of the words in `data`
        # that start with that character.
        layer1={}
        for i in range(len(data)):
          for jj in range(len(now_layer_save_for_data)):
            j=now_layer_save_for_data[jj]
            j2=now_layer_save_for_base[jj]# the slot index is part of the key: the character alone cannot tell e.g. the two '华' nodes apart
            if data[i][0]==j:
                layer1.setdefault((j,j2),[])
                layer1[(j,j2)].append(i)
        # A first-layer character that is itself a word in `data` gets its base negated.
        for i in layer1:
            if i[0] in data:
                base[i[1]]=-base[i[1]]




        # Second layer onwards: the characters to place are derived from the
        # previous layer's (character, slot) -> word-indices mapping.
        now_layer=last_layer+1

        # Design note from the author: scanning `data` once per node would be at
        # least O(N^2); handling a whole layer at once keeps it to one pass per layer.
        layer=layer1
        print(layer)
        # Each iteration turns the mapping of one layer into base/check entries
        # for the next layer and into the mapping for that next layer.
        for iii in range(1,max_ceng):
            now_layer=iii+1
            layer4={}
            print(layer)  # e.g. layer1: {('清', 2): [0, 1, 2], ('中', 7): [3], ('华', 3): [4]}

            for i in layer:
                lastword=i[0]
                lastindex=i[1]
                beixuan=layer[i]
                # Characters to insert below this parent.
                charu=[]
                # Drop words that are too short for this layer (they already ended above).
                beixuan2=[]
                for i in beixuan :
                    if len(data[i])>=now_layer:
                        beixuan2.append(i)
                beixuan=beixuan2

                for i in beixuan:
                    newword=data[i][now_layer-1]
                    if newword not in charu:
                        charu.append(newword)
                # Feed the characters in `charu` into the base/check placement.

                now_layer_save_for_data=[]# characters placed for this parent
                now_layer_save_for_base=[]# their slot indices
                col_now=charu
                before_index=abs(lastindex)
                # Conflict resolution: grow |base[parent]| until the child's slot is free.
                # NOTE(review): each child is checked on its own; a bump made for a later
                # child is not re-validated against children checked earlier - confirm.
                for i in col_now:

                    while 1:
                     index=abs(base[before_index])+bianma[i]
                     if base[index]=='#':# slot is free

                         break
                     else:
                         if base[before_index]>0:
                          base[before_index]+=1
                         else:
                             base[before_index]-=1
                         print(base)
                # Remaining suffixes of the candidate words from this layer on; a child
                # whose character equals a whole suffix ends a word and gets a negative base.
                beixuanciku=[data[i][now_layer-1:] for i in beixuan]
                for i in col_now:
                    if i in beixuanciku:
                        index=abs(base[before_index])+bianma[i]
                        base[index]=-abs(base[before_index])# must be -abs(...): the parent's base may already be negative
                        check[index]=before_index
                        now_layer_save_for_data.append(i)
                        now_layer_save_for_base.append(index)
                    else:
                        index=abs(base[before_index])+bianma[i]
                        base[index]=base[before_index]
                        check[index]=before_index
                        now_layer_save_for_data.append(i)
                        now_layer_save_for_base.append(index)


                # Extend the mapping for the next layer with this parent's children.

                for i in beixuan:
                 for jj in range(len(now_layer_save_for_data)):
                    j=now_layer_save_for_data[jj]
                    j2=now_layer_save_for_base[jj]# slot index disambiguates equal characters under different parents
                    if data[i][now_layer-1]==j:
                        layer4.setdefault((j,j2),[])
                        layer4[(j,j2)].append(i)


        # The new mapping replaces the old one for the next iteration.
            layer=layer4



        # Print the mapping of the last processed layer.
        print(layer)
        # Leftover scaffolding: this loop does nothing.
        layernew={}
        for i in layer:
          pass



        print(now_layer_save_for_data)
        print(now_layer_save_for_base)



        print('测试')
        # Author's note: the special case "a first-layer single character is a word,
        # return immediately" was left as future work.
        print(base)
        print(check)
        # Finally expose the result on the instance.
        self.base=base
        self.check=check
                     
        





                 


        


    def search(self,a):#通过这个函数a在data是否存在,这个函数随便玩了
        
        tmp=0
        #self写起来太麻烦,
        bianma=self.bianma
        base=self.base
        check=self.check
        i=a[0]
        if len(a)==1:
            tmp=1+bianma[i]
            return base[tmp]<0
        else:
            first=1+bianma[a[0]]
            for i in range(len(a)-1):
                tmp=abs(base[first])+bianma[a[i+1]]
                if check[tmp]!=first:
                    return False
                first=tmp
            return base[tmp]<0
        
'''
base:[1, '#', -3, 2, -2, -3, -6, 2, -3, -2, -6, '#', '#']
check:['#', '#', 0, 0, 7, 2, 5, 0, 2, 3, 6, '#', '#']
'''





# Demo: build the trie for a small word list (progress is printed by the constructor).
a=DAT(['清华','清华大学','清新','中华','华人','清'])
# Look up one of the inserted words.
print(a.search('清华大学'))
# Author's note: slightly larger inputs were also tried and ran through.

View Code

'''
倒排索引:https://blog.csdn.net/okiwilldoit/article/details/51362839
'''
'''
先学分词系统jieba:pip install jieba即可.
'''

import jieba
# Sample sentence used to compare jieba's segmentation modes.
s = u'我想和我女朋友一起去我的北京故宫博物院参观我的屋子和闲逛。'
cut = jieba.cut(s)

print ('output')
print (cut) # prints the object itself, not the tokens: per the author's note jieba.cut
            # returns a lazy iterator, which suits large inputs because tokens
            # are produced on demand
print (','.join(cut))# default (precise) mode; author's timing note: ~10 s on first run, ~0.8 s once jieba's cache exists
print (','.join(jieba.cut(s,cut_all = True)))# full mode: emits as many candidate words as possible
print (','.join(jieba.cut_for_search(s)))# search-engine mode: granularity between the two above






#import jieba.posseg as psg
##下面一行返回的是分词后各个词性
##print ([(x.word,x.flag) for x in psg.cut(s)])#x是字符串的意思,这一行太卡了我就注释掉了,
#from collections import Counter#利用Counter这个容器可以返回频率top多少的词,但是速度很慢.
#c = Counter(s).most_common(20)
#print (c)








##下面学习用户字典:
#txt = u'欧阳建国是创新办主任也是欢聚时代公司云计算方面的专家'
#print (','.join(jieba.cut(txt)))

#jieba.load_userdict('user_dict.txt')#注意这个词典,在同目录下,然后编码用笔记本打开另存为utf-8即可.
#print (','.join(jieba.cut(txt)))#效果不错










'''
下面可以进行倒排索引了
'''


# English stop words (function words) that words_cleanup() drops from the index.
# NOTE(review): the list contains duplicates ('above', 'the') and odd spellings
# ('amoungst', 'fify', 'thickv'); they are kept as-is since they only affect
# which tokens are filtered - confirm before cleaning up.
_STOP_WORDS = frozenset([
    'a', 'about', 'above', 'above', 'across', 'after', 'afterwards', 'again',
    'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although',
    'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
    'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as',
    'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been',
    'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides',
    'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can',
    'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe',
    'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight',
    'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even',
    'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few',
    'fifteen', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former',
    'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
    'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', 'here',
    'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him',
    'himself', 'his', 'how', 'however', 'hundred', 'ie', 'if', 'in', 'inc',
    'indeed', 'interest', 'into', 'is', 'it', 'its', 'itself', 'keep', 'last',
    'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many', 'may', 'me',
    'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly',
    'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never',
    'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not',
    'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only',
    'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out',
    'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 'same',
    'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'she',
    'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some',
    'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere',
    'still', 'such', 'system', 'take', 'ten', 'than', 'that', 'the', 'their',
    'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
    'therefore', 'therein', 'thereupon', 'these', 'they', 'thickv', 'thin', 'third',
    'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus',
    'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two',
    'un', 'under', 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
    'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter',
    'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which',
    'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will',
    'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself',
    'yourselves', 'the'])

import os
import jieba
import re
import sys



def word_index(text):
    """Split `text` into words and drop the stop words.

    Returns the list produced by ``words_cleanup(word_split(text))``:
    entries of the form ``(word_index, (offset, word))``.
    """
    return words_cleanup(word_split(text))









def word_split(text):
    """Tokenise `text` with jieba and return ``[(word_index, (offset, word)), ...]``.

    Only tokens that contain Chinese characters or are alphanumeric are kept;
    alphanumeric tokens are lower-cased.  ``word_index`` numbers the kept
    tokens from 0 and ``offset`` is the position of the token in `text`.
    """
    # Raw string with real \u escapes: CJK Unified Ideographs U+4E00..U+9FA5.
    chinese = re.compile(r'[\u4e00-\u9fa5]+')
    word_list = []
    # Tokens are located left to right starting after the previous token, so
    # a repeated token gets the offset of its own occurrence.  (The second
    # argument of str.index/str.find is a start position, not an occurrence count.)
    cursor = 0
    for token in jieba.cut(text):
        offset = text.find(token, cursor)
        if offset < 0:
            # Token cannot be located after the cursor; skip it rather than crash.
            continue
        cursor = offset + len(token)

        if chinese.search(token):  # contains Chinese
            word_list.append((len(word_list), (offset, token)))
        elif token.isalnum():  # English or number, normalised to lower case
            word_list.append((len(word_list), (offset, token.lower())))

    return word_list

#先做单词的预处理,把上面分好的word_list给words_cleanup做筛选
def words_cleanup(words):
    """Return the entries of `words` whose word is not in ``_STOP_WORDS``.

    Each entry is ``(index, (offset, word))``: `index` is the word's number in
    the word list, `offset` its position in the text.  Order is preserved.
    """
    return [
        (index, (offset, word))
        for index, (offset, word) in words
        if word not in _STOP_WORDS
    ]


def inverted_index(text):
    """Build the inverted index of a single text.

    Returns a dict mapping each kept word to the list of its
    ``(word_index, offset)`` locations, in order of appearance.
    """
    cleaned = words_cleanup(word_split(text))
    inverted = {}
    # Group the cleaned entries by word.
    for index, (offset, word) in cleaned:
        if word not in inverted:
            inverted[word] = []
        inverted[word].append((index, offset))
    return inverted
a=inverted_index('路上看到就发了开始的路路路路路路路')# a is the inverted-index dict of this one text; the helpers below build on such dicts

#对倒排索引继续添加功能:这个功能是把旧的inverted倒排字典跟新的doc_index倒排字典做合并.
#下面的方法就是继续加了一层索引是doc_id.这样倒排索引变成了
#key:word value:{doc_id1,....doc_idn} 然后每一个doc_idi对应一个列表
#列表中每一项是(index,offset)    这样就做成了一个完善的多文件系统中找关键字的位置的倒排索引
#使用的时候只需要从inverted={},用inverted_index_add往里面加即可.
def inverted_index_add(inverted, doc_id, doc_index):
    """Merge the single-document index `doc_index` into `inverted` under `doc_id`.

    `inverted` maps word -> {doc_id: locations}.  It is updated in place
    (an existing entry for the same word and doc_id is overwritten) and
    also returned.
    """
    for word in doc_index:
        if word not in inverted:
            inverted[word] = {}
        inverted[word][doc_id] = doc_index[word]
    return inverted
from functools import reduce
def search(inverted, query):
    """Look up the first word of `query` that occurs in `inverted`.

    `inverted` maps word -> {doc_id: [(word_index, offset), ...]}.
    Returns a list of ``(doc_id, word_index, word)`` tuples, one per
    occurrence, or ``[]`` when no word of the query is indexed (previously
    this raised IndexError).
    """
    # Only the first query word that is present in the index is used.
    word = next(
        (w for _, (_offset, w) in word_index(query) if w in inverted),
        None,
    )
    if word is None:
        return []
    return [
        (doc, location[0], word)
        for doc, locations in inverted[word].items()
        for location in locations
    ]




    




# Demo: index two documents under ids 1 and 2, then search for one word.
inverted={}
doc_index = inverted_index('我我的我的是我的我的')
a=inverted_index_add(inverted, 1, doc_index)
doc_index = inverted_index('我想和我女朋友一起去我的北京故宫博物院参观我的屋子和闲逛')
b=inverted_index_add(inverted, 2, doc_index)
print(inverted)
print(search(inverted,'我'))
# Output recorded by the author: [(1, 0, '我'), (1, 1, '我'), (1, 3, '我'), (1, 6, '我'), (1, 8, '我'), (2, 0, '我'), (2, 3, '我'), (2, 7, '我'), (2, 11, '我')]
# Each tuple is (document id, word index within that document, word): search() emits
# the first element of every stored location.  Given a word, this reports which
# documents contain it and where.
# Still missing: ranking of the results so the most relevant hits come first.

View Code

●实验楼的识别验证码

#-*- coding:utf8 -*-
from PIL import Image
import numpy as np
im = Image.open("captcha.gif")
# Intended: convert the image to an 8-bit pixel mode.
# NOTE(review): Image.convert returns a new image and the result is discarded here,
# so the histogram below is taken from the image as opened - confirm this is intended.
im.convert("L")

# Print the colour histogram.
print (im.histogram())

his = im.histogram()
values = {}

# Pixel value -> number of pixels with that value (first 256 histogram bins).
for i in range(256):
    values[i] = his[i]

# Show the 10 most frequent pixel values with their counts.
for j,k in sorted(values.items(),key=lambda x:x[1],reverse = True)[:10]:
    print (j,k)

#-*- coding:utf8 -*-
from PIL import Image
# Denoising step.  Open question from the author: how to proceed when the wanted
# pixel values (220 and 227 here) are not known in advance?
im = Image.open("captcha.gif")
im.convert("P")  # NOTE(review): result discarded; `im` itself is unchanged
a=np.array(im)

# Blank image of the same size filled with 255; wanted pixels are then set to 0.
im2 = Image.new("P",im.size,255)

for x in range(im.size[1]):
    for y in range(im.size[0]):
        # getpixel/putpixel take (column, row); here y is the column and x the row.
        pix = im.getpixel((y,x))
        if pix == 220 or pix == 227: # these are the numbers to get
            im2.putpixel((y,x),0)





a=np.array(im2)
print(a.shape)
# Mind the axis order:
# im2.size is (columns, rows) while a.shape is (rows, columns) - exactly reversed.







# Split the cleaned image im2 into glyphs by scanning its columns: a glyph is
# a maximal run of columns that contain at least one non-background pixel.
inletter = False      # current column contains glyph pixels
foundletter = False   # currently inside a glyph run
start = 0
end = 0

letters = []          # (start, end) column ranges, end exclusive

for y in range(im2.size[0]):          # y is the column index
    inletter = any(im2.getpixel((y, x)) != 255 for x in range(im2.size[1]))
    if inletter and not foundletter:
        # First column of a new glyph.
        foundletter = True
        start = y
    elif foundletter and not inletter:
        # First blank column after a glyph closes the run.
        foundletter = False
        end = y
        letters.append((start, end))

# A glyph that touches the right edge never meets a blank column, so its run
# has to be closed here; otherwise the last character would be lost.
if foundletter:
    foundletter = False
    end = im2.size[0]
    letters.append((start, end))
# Each range is used as im[start:end], i.e. the end column is not included.
    



from PIL import Image
import hashlib
import time
import os


import math

class VectorCompare:
    """Cosine similarity between sparse vectors stored as ``{key: value}`` dicts."""

    def magnitude(self,concordance):
        """Return the Euclidean norm of the values in *concordance*."""
        return math.sqrt(sum(count ** 2 for count in concordance.values()))

    def relation(self,concordance1, concordance2):
        """Return the cosine of the angle between the two vectors.

        Only keys present in both dicts contribute to the dot product.  A value
        close to 1 means the vectors are similar; a value near 0 means they are
        close to orthogonal, i.e. unrelated.

        Returns 0.0 when either vector has zero magnitude (the original raised
        ZeroDivisionError in that case).
        """
        topvalue = 0
        for word, count in concordance1.items():
            if word in concordance2:
                topvalue += count * concordance2[word]

        denominator = self.magnitude(concordance1) * self.magnitude(concordance2)
        if denominator == 0:
            # An empty or all-zero vector has no direction; treat it as unrelated.
            return 0.0
        return topvalue / denominator



def buildvector(im):
    """Return ``{position: pixel_value}`` for every item yielded by ``im.getdata()``.

    Positions are consecutive integers starting at 0, in the order getdata()
    yields the pixels.
    """
    return dict(enumerate(im.getdata()))

v = VectorCompare()

# Class labels; ./iconset/<label>/ holds the training samples of that label.
# The original list contained '0' twice, which loaded every '0' sample twice.
iconset = list('0123456789abcdefghijklmnopqrstuvwxyz')

# Training set: one single-key dict per sample image, shaped
# {label: [pixel_vector]} where pixel_vector is the dict built by buildvector().
# The nesting is kept because the matching loop below reads entries as y[0].
imageset = []

for letter in iconset:
    for img in os.listdir('./iconset/%s/'%(letter)):
        # Skip OS-generated metadata files that are not images.
        if img == "Thumbs.db" or img == ".DS_Store":
            continue
        # The context manager closes the file once the pixels have been read.
        with Image.open("./iconset/%s/%s"%(letter,img)) as sample:
            imageset.append({letter: [buildvector(sample)]})



# Classify every segmented glyph: compare it with each training sample and
# print the best (similarity, label) pair.
for letter in letters:
    # im3 is a single character: columns [start, end) over the full height.
    im3 = im2.crop(( letter[0] , 0, letter[1],im2.size[1] ))
    # Build the glyph's vector once; the original rebuilt it for every training
    # sample.  (The unused hashlib.md5() object was dropped.)
    candidate = buildvector(im3)
    guess = []

    for image in imageset:
        for x,y in image.items():
            # x is the label, y[0] the training sample's pixel vector.
            guess.append(( v.relation(y[0],candidate),x) )

    # Highest cosine similarity first.
    guess.sort(reverse=True)
    print ("",guess[0])

View Code

●莫烦教python的pytorch课程:

做回归:

import torch
import matplotlib.pyplot as plt
# unsqueeze(dim=1) inserts a new axis at position 1, turning the 100-element
# vector into a (100, 1) column.
x = torch.linspace(-1, 1, 100).unsqueeze(dim=1)     # x data (tensor), shape=(100, 1)
# Parabola plus uniform noise drawn from [0, 0.2).
y = x.pow(2) + 0.2*torch.rand(x.size())             # noisy y data (tensor), shape=(100, 1)

# 画图
#plt.scatter(x.data.numpy(), y.data.numpy())
#plt.show()




import torch
import torch.nn.functional as F     # 激励函数都在这

class Net(torch.nn.Module):
    """Fully connected regressor with one hidden layer: Linear -> ReLU -> Linear."""

    def __init__(self, n_feature, n_hidden, n_output):
        super().__init__()
        # Hidden layer first, output layer second (same creation order as before).
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        self.predict = torch.nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Forward pass: ReLU over the hidden layer, then the linear output layer."""
        activated = F.relu(self.hidden(x))
        return self.predict(activated)

# Build the 1 -> 10 -> 1 regression network and train it on the (x, y) data
# defined above, redrawing the fit every 5 steps.
net = Net(n_feature=1, n_hidden=10, n_output=1)

print(net)  # network structure
"""
Net (
  (hidden): Linear (1 -> 10)
  (predict): Linear (10 -> 1)
)
"""

# The optimizer is the tool that performs the training updates.
optimizer = torch.optim.SGD(net.parameters(), lr=0.2)  # all parameters of net, learning rate
loss_func = torch.nn.MSELoss()      # error between prediction and target (mean squared error)

for t in range(100):
    prediction = net(x)     # feed the training data x to net, get the prediction
    

    loss = loss_func(prediction, y)     # error between the two

    optimizer.zero_grad()   # clear the gradients left over from the previous step
    loss.backward()         # back-propagate the error to compute gradients
    optimizer.step()        # apply the updates to net's parameters
    # Plotting
    if t % 5 == 0:
        # plot and show learning process
        plt.cla()
        plt.scatter(x.data.numpy(), y.data.numpy())
        plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
        plt.text(0.5, 0, 'Loss=%.4f' % loss.data.numpy(), fontdict={'size': 20, 'color':  'red'})
        plt.pause(0.1)
        plt.ion()   # interactive mode: successive redraws produce an animation
        # NOTE(review): plt.ion() is re-issued on every redraw; the classification
        # example in this file calls it once before the loop instead.
        plt.show()

View Code

分类:

import torch
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt



# Synthetic two-class data: two point clouds drawn around (2, 2) and (-2, -2).
n_data = torch.ones(100, 2)             # template that fixes the sample shape
x0 = torch.normal(n_data * 2, 1)        # class 0 inputs, shape=(100, 2)

y0 = torch.zeros(100)                   # class 0 labels, shape=(100,)

x1 = torch.normal(n_data * -2, 1)       # class 1 inputs, shape=(100, 2)

y1 = torch.ones(100)                    # class 1 labels, shape=(100,)

# The network and loss below need float32 inputs and int64 class-index labels;
# torch.cat stacks the two classes along dim 0.
x = torch.cat([x0, x1], dim=0).float()  # 32-bit floating point, shape=(200, 2)

y = torch.cat([y0, y1], dim=0).long()   # 64-bit integer, shape=(200,)



import torch
import torch.nn.functional as F     # 激励函数都在这

class Net(torch.nn.Module):
    """Fully connected classifier with one hidden layer; returns raw class scores."""

    def __init__(self, n_feature, n_hidden, n_output):
        super().__init__()
        # Hidden layer first, output layer second (same creation order as before).
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        self.out = torch.nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Forward pass.

        The result is the output layer's raw scores, not a prediction; callers
        derive the predicted class from it themselves.
        """
        activated = F.relu(self.hidden(x))
        return self.out(activated)

net = Net(n_feature=2, n_hidden=10, n_output=2) # one output per class

print(net)  # network structure
"""
Net (
  (hidden): Linear (2 -> 10)
  (out): Linear (10 -> 2)
)
"""


# The optimizer is the tool that performs the training updates.
optimizer = torch.optim.SGD(net.parameters(), lr=0.02)  # all parameters of net, learning rate
# For the loss, the target is NOT one-hot: it is a 1-D tensor of class indices
# with shape (batch,), while the prediction is 2-D with shape (batch, n_classes).
loss_func = torch.nn.CrossEntropyLoss()
plt.ion()   # interactive plotting is switched on/off outside the training loop
for t in range(100):
    out = net(x)     # feed the training data x to net, get the raw scores

    loss = loss_func(out, y)     # error between scores and labels

    optimizer.zero_grad()   # clear the gradients left over from the previous step
    loss.backward()         # back-propagate the error to compute gradients
    optimizer.step()        # apply the updates to net's parameters

    # Plotting, every second step.
    if t % 2 == 0:
        plt.cla()
        # The predicted class is the index of the highest softmax probability.
        # dim=1 is passed explicitly: softmax runs over the class axis, and the
        # implicit-dim form used originally is deprecated.
        prediction = torch.max(F.softmax(out, dim=1), 1)[1]

        pred_y = prediction.data.numpy().squeeze()
        target_y = y.data.numpy()
        plt.scatter(x.data.numpy()[:, 0], x.data.numpy()[:, 1], c=pred_y, s=100, lw=0, cmap='RdYlGn')
        # Fraction of predictions equal to the label; the mean replaces the
        # hard-coded division by 200 so the value stays right for any sample count.
        accuracy = (pred_y == target_y).mean()
        plt.text(1.5, -4, 'Accuracy=%.2f' % accuracy, fontdict={'size': 20, 'color':  'red'})
        plt.pause(0.1)

        plt.show()

plt.ioff()  # leave interactive mode once training is done
plt.show()  # final blocking show so the last figure stays on screen

View Code

保存整个网络

import torch

torch.manual_seed(1)    # reproducible

# Toy data
x = torch.unsqueeze(torch.linspace(-1, 1, 100), dim=1)  # x data (tensor), shape=(100, 1)
y = x.pow(2) + 0.2*torch.rand(x.size())  # noisy y data (tensor), shape=(100, 1)

def save():
    """Train a small network on the module-level x, y and save the whole module to 'net.pkl'."""
    # Build the network
    net1 = torch.nn.Sequential(
        torch.nn.Linear(1, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1)
    )
    optimizer = torch.optim.SGD(net1.parameters(), lr=0.5)
    loss_func = torch.nn.MSELoss()

    # Train
    for t in range(100):
        prediction = net1(x)
        loss = loss_func(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    torch.save(net1, 'net.pkl')  # save the entire network (structure + parameters)

# Placeholder; restore_net() rebinds this global to the loaded network.
net2=0
def restore_net():
    """Load the network saved by save() from 'net.pkl' into the global net2."""
    # restore entire net1 to net2
    global net2
    # NOTE(review): loading a whole pickled module depends on the installed
    # PyTorch version's torch.load defaults -- confirm before relying on it.
    net2 = torch.load('net.pkl')
    prediction = net2(x)    # forward pass with the restored net; result is unused
    
# Save net1 (the entire network)
save()

# Restore the entire network
restore_net()
print(net2)

View Code

保存网络的参数就能重建网络

import torch

torch.manual_seed(1)    # reproducible

# Toy regression data: a noisy parabola.
x = torch.linspace(-1, 1, 100).unsqueeze(dim=1)  # x data (tensor), shape=(100, 1)
y = x.pow(2) + 0.2*torch.rand(x.size())  # noisy y data (tensor), shape=(100, 1)


def _build_net():
    """Return a fresh, untrained 1 -> 10 -> 1 network with a ReLU in between."""
    return torch.nn.Sequential(
        torch.nn.Linear(1, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1),
    )


def save():
    """Train a network on the module-level x, y and save only its parameters.

    The state dict is written to 'net_params.pkl'; compared with saving the
    whole module this is faster and produces a smaller file.
    """
    net1 = _build_net()
    loss_func = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(net1.parameters(), lr=0.5)

    # 100 full-batch gradient steps.
    for _ in range(100):
        loss = loss_func(net1(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    torch.save(net1.state_dict(), 'net_params.pkl')


# Placeholder; restore_params() rebinds this global to the rebuilt network.
net3=0
def restore_params():
    """Rebuild the architecture and load the saved parameters into the global net3.

    Restoring from parameters requires re-declaring the network structure, so
    this approach needs a little more code than pickling the whole module.
    """
    global net3
    net3 = _build_net()

    # Copy the saved parameters into net3.
    net3.load_state_dict(torch.load('net_params.pkl'))
    net3(x)     # forward pass with the restored net; result is unused


# Train net1 and save its parameters only.
save()

# Rebuild the network from the saved parameters.
restore_params()
print(net3)

View Code

batch训练模板的使用

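# This code targets torch 0.4 and differs slightly from 莫烦 (Morvan)'s tutorial, which used an older version.
# Wrap the training data x, y in Data.TensorDataset and hand it to a DataLoader; after that only BATCH_SIZE needs setting.
# There is no need to shuffle or split into batches by hand. If the number of samples is not divisible by
# batch_size, the last batch simply holds all the remaining samples,
# which is exactly the behaviour we want.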

# Mini-batch iteration demo; runs only when the file is executed as a script.
if __name__=='__main__':
    import torch
    import torch.utils.data as Data
    torch.manual_seed(1)    # reproducible

    BATCH_SIZE = 8      # number of samples per mini-batch

    x = torch.linspace(1, 10, 10)       # x data (torch tensor)
    y = torch.linspace(10, 1, 10)       # y data (torch tensor)

    # First wrap the tensors in a Dataset that torch understands.

    torch_dataset = Data.TensorDataset(x, y)

    # Hand the dataset to a DataLoader.
    loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # shuffle the data each epoch (usually preferable)
        num_workers=2,              # load data with 2 background workers
    )

    for epoch in range(3):   # iterate over the WHOLE dataset 3 times
        for step, (batch_x, batch_y) in enumerate(loader):  # each step the loader yields one mini-batch
            # This is where the actual training step would go...

            # Left empty for now; fill in the training steps when needed.


            # Print what each batch contains.
            print('Epoch: ', epoch, '| Step: ', step, '| batch x: ',
                  batch_x.numpy(), '| batch y: ', batch_y.numpy())

View Code

cnn跑mnist

import torch
import torch.nn as nn
import torch.utils.data as Data
import torchvision      # 数据库模块
import matplotlib.pyplot as plt

torch.manual_seed(1)    # reproducible

# Hyper Parameters
EPOCH = 1           # passes over the full training set; one pass only, to save time
BATCH_SIZE = 50
LR = 0.001          # learning rate
DOWNLOAD_MNIST = False  # passed as download= below; keep False once MNIST is already in ./mnist/

# Load the data
# MNIST handwritten digits
# Training split
train_data = torchvision.datasets.MNIST(
    root='./mnist/',    # where the data is stored / read from
    train=True,  # this is training data
    transform=torchvision.transforms.ToTensor(),    # converts a PIL.Image or numpy.ndarray to a
                                                    # torch.FloatTensor (C x H x W), scaled to [0.0, 1.0] during training
    download=DOWNLOAD_MNIST,          # download only if requested
)
# Test split (no transform: the raw tensors are used directly below)
test_data = torchvision.datasets.MNIST(root='./mnist/', train=False)

# Mini-batches of BATCH_SIZE samples, 1 channel, 28x28 -> (50, 1, 28, 28)
train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

# To save time only the first 2000 test samples are used.
# NOTE(review): .test_data / .test_labels are legacy attribute names in newer
# torchvision releases -- confirm against the installed version.
test_x = torch.unsqueeze(test_data.test_data, dim=1).type(torch.FloatTensor)[:2000]/255.   # shape from (2000, 28, 28) to (2000, 1, 28, 28), value in range(0,1)
test_y = test_data.test_labels[:2000]
print(test_y)





#wangluo
class CNN(nn.Module):
    """Small MNIST classifier: two conv/ReLU/max-pool stages and a linear layer.

    Expects input of shape (batch, 1, 28, 28) and returns (batch, 10) class scores.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: (1, 28, 28) -> conv -> (5, 28, 28) -> ReLU -> 2x2 pool -> (5, 14, 14).
        # With stride 1, padding = (kernel_size - 1) / 2 keeps height and width unchanged.
        first_conv = nn.Conv2d(1, 5, kernel_size=5, stride=1, padding=2)
        self.conv1 = nn.Sequential(first_conv, nn.ReLU(), nn.MaxPool2d(kernel_size=2))

        # Stage 2: (5, 14, 14) -> conv -> (32, 14, 14) -> ReLU -> 2x2 pool -> (32, 7, 7).
        second_conv = nn.Conv2d(in_channels=5, out_channels=32, kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Sequential(second_conv, nn.ReLU(), nn.MaxPool2d(kernel_size=2))

        # Fully connected layer mapping the flattened features to the 10 classes.
        self.out = nn.Linear(32 * 7 * 7, 10)

    def forward(self, x):
        """Run both conv stages, flatten per sample, and return the class scores."""
        features = self.conv2(self.conv1(x))
        # view() acts as a reshape here: (batch, 32, 7, 7) -> (batch, 32 * 7 * 7).
        flat = features.view(features.size(0), -1)
        return self.out(flat)

cnn = CNN()
print(cnn)  # net architecture

optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)   # optimize all cnn parameters
loss_func = nn.CrossEntropyLoss()   # the target label is not one-hotted

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader):   # yields one mini-batch per step; x is already normalised by the loader's transform
        output = cnn(b_x)               # cnn output
        loss = loss_func(output, b_y)   # cross entropy loss
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients
# Score the first 10 test images with the trained network.
test_output = cnn(test_x[:10]) # test_output: 10 x 10 (samples x classes)
print(test_output)
print(torch.max(test_output, 1))
# torch.max(t, 1) reduces over dim 1 and returns (max values, their indices);
# element [1] is therefore the predicted class index of each sample.
pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
print(pred_y, 'prediction number')
print(test_y[:10].numpy(), 'real number')

View Code

lstm跑mnist

'''
Rnn与LSTM
rnn:是循环记忆.也就是学习第n句话是根据前面n-1句和这个第n句一起来决定这句话是什么意思.
    但是梯度爆炸或者梯度消失会使得rnn没法学习太久远的记忆.而有一些意思就是被久远的东西所决定.
    比如:英语喜欢把主谓宾放前面,修饰词都放后面.就很难学到关键信息
LSTM:长短时记忆.可以把前面记忆的东西加一个权重,越重要的东西权重越大,就是长时间记忆.
    设置对于重要记忆,梯度直接提取,不用bp反复的迭代,这样久远的重要东西仍然效果强力.(感觉自己写会很复杂)
'''

#LSTM跑mnist
import torch
from torch import nn
import torchvision
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.utils.data as Data
import torchvision      # 数据库模块
import matplotlib.pyplot as plt

torch.manual_seed(1)    # reproducible

# Hyper Parameters
EPOCH = 1           # passes over the full training set; one pass only, to save time
BATCH_SIZE = 64
TIME_STEP = 28      # rnn time steps / image height
INPUT_SIZE = 28     # rnn input size per step / pixels per image row
LR = 0.01           # learning rate
DOWNLOAD_MNIST = True  # passed as download= below; set to False once MNIST is already downloaded


# MNIST handwritten digits
train_data = torchvision.datasets.MNIST(
    root='./mnist/',    # where the data is stored / read from
    train=True,  # this is training data
    transform=torchvision.transforms.ToTensor(),    # converts a PIL.Image or numpy.ndarray to a
                                                    # torch.FloatTensor (C x H x W), scaled to [0.0, 1.0] during training
    download=DOWNLOAD_MNIST,          # download only if requested
)


test_data = torchvision.datasets.MNIST(root='./mnist/', train=False)

# Mini-batches of BATCH_SIZE samples; presumably each is (BATCH_SIZE, 1, 28, 28)
train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

# To save time only the first 2000 test samples are used.
# NOTE(review): .test_data / .test_labels are legacy attribute names in newer
# torchvision releases -- confirm against the installed version.
test_x = torch.unsqueeze(test_data.test_data, dim=1).type(torch.FloatTensor)[:2000]/255.   # shape from (2000, 28, 28) to (2000, 1, 28, 28), value in range(0,1)
test_y = test_data.test_labels[:2000]





class RNN(nn.Module):
    """LSTM classifier that reads a 28x28 image as a sequence of 28 rows.

    Each row (28 pixels) is one time step; the output at the last time step is
    mapped to 10 class scores.
    """

    def __init__(self):
        super().__init__()
        # input_size=28 (pixels per row), hidden_size=64, a single layer.
        # batch_first=True: tensors are laid out as (batch, time_step, input_size).
        self.rnn = nn.LSTM(28, 64, num_layers=1, batch_first=True)

        # Output layer: hidden state -> 10 class scores.
        self.out = nn.Linear(64, 10)

    def forward(self, x):
        """Return class scores of shape (batch, 10) for x of shape (batch, time_step, input_size)."""
        # Passing None as the initial state makes the LSTM start from an all-zero state.
        # r_out is (batch, time_step, hidden_size); the returned state pair is not needed.
        r_out, _states = self.rnn(x, None)

        # Use only the last time step (the image's last row): by then the LSTM
        # has seen every row, so it is the most informative output.
        last_step = r_out[:, -1, :]
        return self.out(last_step)

rnn = RNN()
print(rnn)


optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all parameters
loss_func = nn.CrossEntropyLoss()   # the target label is not one-hotted

# training and testing
for epoch in range(EPOCH):
    for step, (x, b_y) in enumerate(train_loader):   # gives batch data
        b_x = x.view(-1, 28, 28)   # reshape x to (batch, time_step, input_size)

        output = rnn(b_x)               # rnn output
        loss = loss_func(output, b_y)   # cross entropy loss
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        optimizer.step()                # apply gradients
# Predict the first 10 test images; index [1] of torch.max(..., 1) is the class index.
test_output = rnn(test_x[:10].view(-1, 28, 28))
pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
print(pred_y, 'prediction number')
print(test_y[:10], 'real number')

View Code

rnn回归

import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt

torch.manual_seed(1)    # reproducible

# Hyper Parameters
TIME_STEP = 10      # rnn time step / image height
INPUT_SIZE = 1      # rnn input size / image width
LR = 0.02           # learning rate
DOWNLOAD_MNIST = False  # set to True if haven't download the data
# NOTE(review): DOWNLOAD_MNIST looks like a leftover from the MNIST examples;
# nothing in this regression snippet appears to read it.


class RNN(nn.Module):
    """Plain RNN regressor that emits one scalar prediction per time step."""

    def __init__(self):
        super().__init__()

        # A vanilla RNN is enough for this task: input_size=1, hidden_size=32,
        # one layer.  batch_first=True: (batch, time_step, input_size) layout.
        self.rnn = nn.RNN(1, 32, num_layers=1, batch_first=True)
        self.out = nn.Linear(32, 1)

    def forward(self, x, h_state):
        """Return (predictions, new hidden state).

        x is (batch, time_step, input_size) and h_state is
        (n_layers, batch, hidden_size) or None.  The hidden state is carried
        from call to call by the caller, so it is both an input and an output.
        """
        r_out, h_state = self.rnn(x, h_state)

        # Apply the output layer to every time step separately, then stack the
        # per-step results back along the time axis.
        per_step = [self.out(r_out[:, t, :]) for t in range(r_out.size(1))]
        return torch.stack(per_step, dim=1), h_state


rnn = RNN()
print(rnn)




optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
loss_func = nn.MSELoss()

h_state = None   # None selects the default initial hidden state

for step in range(100):
    start, end = step * np.pi, (step+1)*np.pi   # time steps
    # Use sin to predict cos
    steps = np.linspace(start, end, 10, dtype=np.float32)
    x_np = np.sin(steps)    # float32 for converting torch FloatTensor
    y_np = np.cos(steps)

    x = torch.from_numpy(x_np[np.newaxis, :, np.newaxis])    # shape (batch, time_step, input_size)
    y = torch.from_numpy(y_np[np.newaxis, :, np.newaxis])

    prediction, h_state = rnn(x, h_state)   # per-step predictions plus the hidden state after the last step
    # !!  the next line is essential !!
    h_state = h_state.data  # re-wrap h_state before feeding it to the next iteration; otherwise an error is raised

    loss = loss_func(prediction, y)     # mean squared error (loss_func is nn.MSELoss)
    optimizer.zero_grad()               # clear gradients for this training step
    loss.backward()                     # backpropagation, compute gradients
    optimizer.step()                    # apply gradients
    if step%5==0:
        print('loss:',loss)

View Code

自编码

import torch
import torch.nn as nn
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
# Hyper parameters
EPOCH = 1
BATCH_SIZE = 64
LR = 0.005
DOWNLOAD_MNIST = True   # can be set to False once the data has been downloaded
N_TEST_IMG = 5          # number of images to display when inspecting results

# Mnist digits dataset
train_data = torchvision.datasets.MNIST(
    root='./mnist/',
    train=True,                                     # this is training data
    transform=torchvision.transforms.ToTensor(),    # Converts a PIL.Image or numpy.ndarray to
                                                    # torch.FloatTensor of shape (C x H x W) and normalize in the range [0.0, 1.0]
    download=DOWNLOAD_MNIST,                        # download it if you don't have it
)

class AutoEncoder(nn.Module):
    """Fully connected autoencoder: 784 -> 128 -> 64 -> 12 -> 3 and back.

    The 3-value bottleneck exists so the codes can be shown in a 3D plot.
    """

    def __init__(self):
        super().__init__()
        widths = [28*28, 128, 64, 12, 3]

        # Compress: Linear layers with Tanh in between, ending on a bare Linear.
        self.encoder = nn.Sequential(*self._stack(widths))
        # Decompress: the mirrored stack, followed by a Sigmoid so every output
        # value lies in (0, 1).
        self.decoder = nn.Sequential(*self._stack(widths[::-1]), nn.Sigmoid())

    @staticmethod
    def _stack(widths):
        """Return Linear layers for consecutive widths with a Tanh between each pair."""
        layers = []
        for n_in, n_out in zip(widths, widths[1:]):
            if layers:
                layers.append(nn.Tanh())
            layers.append(nn.Linear(n_in, n_out))
        return layers

    def forward(self, x):
        """Return (encoded, decoded) for a batch of flattened images."""
        encoded = self.encoder(x)
        return encoded, self.decoder(encoded)

autoencoder = AutoEncoder()


optimizer = torch.optim.Adam(autoencoder.parameters(), lr=LR)
loss_func = nn.MSELoss()
train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
for epoch in range(EPOCH):
    for step, (x, b_label) in enumerate(train_loader):
        # Input and reconstruction target are the same flattened images;
        # the labels are not used for training.
        b_x = x.view(-1, 28*28)   # batch x, shape (batch, 28*28)
        b_y = x.view(-1, 28*28)   # batch y, shape (batch, 28*28)

        encoded, decoded = autoencoder(b_x)

        loss = loss_func(decoded, b_y)      # mean square error
        optimizer.zero_grad()               # clear gradients for this training step
        loss.backward()                     # backpropagation, compute gradients
        optimizer.step()                    # apply gradients


# Data to visualise: the first 200 training images, flattened and scaled to [0, 1].
# NOTE(review): .train_data / .train_labels are legacy attribute names in newer
# torchvision releases -- confirm against the installed version.
view_data = train_data.train_data[:200].view(-1, 28*28).type(torch.FloatTensor)/255.
encoded_data, _ = autoencoder(view_data)    # extract the compressed 3-value codes
fig = plt.figure(2)

# `cm` was used below without ever being imported (NameError in the original).
from matplotlib import cm
# Kept so the name stays defined and the '3d' projection is registered on older Matplotlib.
from mpl_toolkits.mplot3d import Axes3D
# add_subplot attaches the 3D axes to the figure; a bare Axes3D(fig) is no
# longer added automatically on current Matplotlib, which leaves the plot empty.
ax = fig.add_subplot(111, projection='3d')    # 3D axes
# x, y, z coordinates: the three code values of every image
X = encoded_data.data[:, 0].numpy()
Y = encoded_data.data[:, 1].numpy()
Z = encoded_data.data[:, 2].numpy()
values = train_data.train_labels[:200].numpy()  # digit labels
for x, y, z, s in zip(X, Y, Z, values):
    c = cm.rainbow(int(255*s/9))    # colour by digit
    ax.text(x, y, z, s, backgroundcolor=c)  # write the digit at its code position
ax.set_xlim(X.min(), X.max())
ax.set_ylim(Y.min(), Y.max())
ax.set_zlim(Z.min(), Z.max())
plt.show()

View Code

lstm 加入 bn,自动调整学习率, 梯度clip.为了以后修改方便

'''
Rnn与LSTM
rnn:是循环记忆.也就是学习第n句话是根据前面n-1句和这个第n句一起来决定这句话是什么意思.
    但是梯度爆炸或者梯度消失会使得rnn没法学习太久远的记忆.而有一些意思就是被久远的东西所决定.
    比如:英语喜欢把主谓宾放前面,修饰词都放后面.就很难学到关键信息
LSTM:长短时记忆.可以把前面记忆的东西加一个权重,越重要的东西权重越大,就是长时间记忆.
    设置对于重要记忆,梯度直接提取,不用bp反复的迭代,这样久远的重要东西仍然效果强力.(感觉自己写会很复杂)
'''

#LSTM跑mnist
import torch
from torch import nn
import torchvision
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.utils.data as Data
import torchvision      # 数据库模块
import matplotlib.pyplot as plt

torch.manual_seed(1)    # reproducible

# Hyper Parameters
EPOCH = 1           # passes over the full training set; one pass only, to save time
BATCH_SIZE =64
TIME_STEP = 28      # rnn time steps / image height
INPUT_SIZE = 28     # rnn input size per step / pixels per image row
LR = 0.001           # learning rate
DOWNLOAD_MNIST = True  # passed as download= below; set to False once MNIST is already downloaded


# MNIST handwritten digits
train_data = torchvision.datasets.MNIST(
    root='./mnist/',    # where the data is stored / read from
    train=True,  # this is training data
    transform=torchvision.transforms.ToTensor(),    # converts a PIL.Image or numpy.ndarray to a
                                                    # torch.FloatTensor (C x H x W), scaled to [0.0, 1.0] during training
    download=DOWNLOAD_MNIST,          # download only if requested
)


test_data = torchvision.datasets.MNIST(root='./mnist/', train=False)

# Mini-batches of BATCH_SIZE samples; presumably each is (BATCH_SIZE, 1, 28, 28)
train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

# To save time only the first 2000 test samples are used.
# NOTE(review): .test_data / .test_labels are legacy attribute names in newer
# torchvision releases -- confirm against the installed version.
test_x = torch.unsqueeze(test_data.test_data, dim=1).type(torch.FloatTensor)[:2000]/255.   # shape from (2000, 28, 28) to (2000, 1, 28, 28), value in range(0,1)
test_y = test_data.test_labels[:2000]





class RNN(nn.Module):
    """LSTM classifier for 28x28 images with a hand-written batch normalisation.

    The image is read as 28 time steps of 28 pixels.  Before the LSTM the input
    is normalised over the batch dimension and passed through a learnable
    28 -> 28 affine layer.
    """

    def __init__(self):
        super().__init__()

        # input_size=28 (pixels per row), hidden_size=120, a single layer.
        # batch_first=True: tensors are laid out as (batch, time_step, input_size).
        self.rnn = nn.LSTM(28, 120, num_layers=1, batch_first=True)

        self.out = nn.Linear(120, 10)   # output layer: hidden state -> 10 class scores
        self.out2 = nn.Linear(28, 28)   # affine layer applied right after the normalisation

    def forward(self, x):
        """Return class scores of shape (batch, 10) for x of shape (batch, time_step, input_size)."""
        # Manual batch normalisation: mean and standard deviation are taken
        # over dim 0, the batch dimension.
        # epsilon must be small enough; the author reports that 0.01 did not work.
        epsilon = 0.001
        batch_mean = x.mean(dim=0)
        batch_std = x.std(dim=0)
        normalised = (x - batch_mean) / (batch_std + epsilon)

        # Learnable affine map after the normalisation, so the network can
        # re-fit the scale and shift that normalising removed.
        x = self.out2(normalised)

        # Passing None as the initial state makes the LSTM start from an all-zero state.
        # r_out is (batch, time_step, hidden_size), i.e. (batch, 28, 120).
        r_out, _states = self.rnn(x, None)

        # Only the last time step is classified: by then the LSTM has seen
        # every row of the image.  The result is (batch, 10).
        return self.out(r_out[:, -1, :])

rnn = RNN()
print(rnn)

# Settings follow the tuning advice in https://www.cnblogs.com/bamtercelboo/p/7469005.html:
# weight_decay adds an L2 penalty term.
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR, weight_decay=1e-8)   # optimize all parameters
loss_func = nn.CrossEntropyLoss()   # the target label is not one-hotted

# training and testing










def adjust_learning_rate(optimizer, decay_rate=.9):
    """Scale the learning rate of every parameter group of *optimizer* in place.

    Each group's 'lr' entry is replaced by its current value times *decay_rate*.
    """
    for group in optimizer.param_groups:
        current_lr = group['lr']
        group['lr'] = current_lr * decay_rate




for epoch in range(EPOCH):
    for step, (x, b_y) in enumerate(train_loader):   # gives batch data
        # Dynamic learning rate: decay every 200 steps (the author found the effect small).
        # NOTE(review): step 0 satisfies step%200==0, so the first decay is applied
        # before any training step has run.
        if step%200==0:
            adjust_learning_rate(optimizer)
            
            
        b_x = x.view(-1, 28, 28)   # reshape x to (batch, time_step, input_size)

        output = rnn(b_x)               # rnn output
        loss = loss_func(output, b_y)   # cross entropy loss
        optimizer.zero_grad()           # clear gradients for this training step
        loss.backward()                 # backpropagation, compute gradients
        
        # Gradient clipping: rescale gradients so their 4-norm does not exceed 1.
        # (Open question left by the author: why is there no lower bound?)
        torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1,norm_type=4)
        
        
        optimizer.step()                # apply gradients
        if step%20==0:
            print(loss)
# test_output = rnn(test_x[:10].view(-1, 28, 28))
# pred_y = torch.max(test_output, 1)[1].data.numpy().squeeze()
# print(pred_y, 'prediction number')
# print(test_y[:10], 'real number')

View Code

keras,lstm做时间序列分析模板:

#库包的安装:https://blog.csdn.net/yangqiang200608/article/details/78719568?locationNum=9&fps=1
#keras的文档:http://keras-cn.readthedocs.io/en/latest/layers/core_layer/#dense
#老外的文档:https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import numpy

# Adaptation of the tutorial linked above to forecast the author's hourly
# Agricultural Bank data.
# load the dataset
# Only column index 1 is read and the last 3 lines of the file are skipped.
# NOTE(review): 'd:lstm_try.csv' has no path separator after the drive letter --
# confirm it resolves to the intended file.
dataframe = read_csv(r'd:lstm_try.csv', usecols=[1], engine='python', skipfooter=3)
dataset = dataframe.values
dataset = dataset.astype('float32')
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error




#对数据进行时间扩充的函数
def create_dataset(dataset, look_back=3):
    """Turn a series into sliding-window samples for supervised learning.

    *dataset* is a 2-D array whose column 0 holds the series.  Sample i is made
    of the ``look_back`` consecutive values starting at row i, and its target
    is the value that immediately follows that window.

    Returns ``(X, Y)`` as numpy arrays with ``len(dataset) - look_back`` samples.
    """
    starts = range(len(dataset) - look_back)
    # Column 0 of rows [s, s + look_back): the history window.
    windows = [dataset[s:s + look_back, 0] for s in starts]
    # The value right after each window: what the model should predict.
    targets = [dataset[s + look_back, 0] for s in starts]
    return numpy.array(windows), numpy.array(targets)


'''
(array([[ 112.,  118.,  132.],
       [ 118.,  132.,  129.],
       [ 132.,  129.,  121.],
       [ 129.,  121.,  135.],
       [ 121.,  135.,  148.],
       [ 135.,  148.,  148.],
       [ 148.,  148.,  136.],
       [ 148.,  136.,  119.],
       [ 136.,  119.,  104.],
       [ 119.,  104.,  118.],
       [ 104.,  118.,  115.],
       [ 118.,  115.,  126.],
       [ 115.,  126.,  141.],
       [ 126.,  141.,  135.],
       [ 141.,  135.,  125.],
       [ 135.,  125.,  149.],
       [ 125.,  149.,  170.],
       [ 149.,  170.,  170.],
       [ 170.,  170.,  158.],
       [ 170.,  158.,  133.],
       [ 158.,  133.,  114.],
       [ 133.,  114.,  140.],
       [ 114.,  140.,  145.],
       [ 140.,  145.,  150.],
       [ 145.,  150.,  178.],
       [ 150.,  178.,  163.],
       [ 178.,  163.,  172.],
       [ 163.,  172.,  178.],
       [ 172.,  178.,  199.],
       [ 178.,  199.,  199.],
       [ 199.,  199.,  184.],
       [ 199.,  184.,  162.],
       [ 184.,  162.,  146.],
       [ 162.,  146.,  166.],
       [ 146.,  166.,  171.],
       [ 166.,  171.,  180.],
       [ 171.,  180.,  193.],
       [ 180.,  193.,  181.],
       [ 193.,  181.,  183.],
       [ 181.,  183.,  218.],
       [ 183.,  218.,  230.],
       [ 218.,  230.,  242.],
       [ 230.,  242.,  209.],
       [ 242.,  209.,  191.],
       [ 209.,  191.,  172.],
       [ 191.,  172.,  194.],
       [ 172.,  194.,  196.],
       [ 194.,  196.,  196.],
       [ 196.,  196.,  236.],
       [ 196.,  236.,  235.],
       [ 236.,  235.,  229.],
       [ 235.,  229.,  243.],
       [ 229.,  243.,  264.],
       [ 243.,  264.,  272.],
       [ 264.,  272.,  237.],
       [ 272.,  237.,  211.],
       [ 237.,  211.,  180.],
       [ 211.,  180.,  201.],
       [ 180.,  201.,  204.],
       [ 201.,  204.,  188.],
       [ 204.,  188.,  235.],
       [ 188.,  235.,  227.],
       [ 235.,  227.,  234.],
       [ 227.,  234.,  264.],
       [ 234.,  264.,  302.],
       [ 264.,  302.,  293.],
       [ 302.,  293.,  259.],
       [ 293.,  259.,  229.],
       [ 259.,  229.,  203.],
       [ 229.,  203.,  229.],
       [ 203.,  229.,  242.],
       [ 229.,  242.,  233.],
       [ 242.,  233.,  267.],
       [ 233.,  267.,  269.],
       [ 267.,  269.,  270.],
       [ 269.,  270.,  315.],
       [ 270.,  315.,  364.],
       [ 315.,  364.,  347.],
       [ 364.,  347.,  312.],
       [ 347.,  312.,  274.],
       [ 312.,  274.,  237.],
       [ 274.,  237.,  278.],
       [ 237.,  278.,  284.],
       [ 278.,  284.,  277.],
       [ 284.,  277.,  317.],
       [ 277.,  317.,  313.],
       [ 317.,  313.,  318.],
       [ 313.,  318.,  374.],
       [ 318.,  374.,  413.],
       [ 374.,  413.,  405.],
       [ 413.,  405.,  355.],
       [ 405.,  355.,  306.],
       [ 355.,  306.,  271.],
       [ 306.,  271.,  306.],
       [ 271.,  306.,  315.],
       [ 306.,  315.,  301.],
       [ 315.,  301.,  356.],
       [ 301.,  356.,  348.],
       [ 356.,  348.,  355.],
       [ 348.,  355.,  422.],
       [ 355.,  422.,  465.],
       [ 422.,  465.,  467.],
       [ 465.,  467.,  404.],
       [ 467.,  404.,  347.],
       [ 404.,  347.,  305.],
       [ 347.,  305.,  336.],
       [ 305.,  336.,  340.],
       [ 336.,  340.,  318.],
       [ 340.,  318.,  362.],
       [ 318.,  362.,  348.],
       [ 362.,  348.,  363.],
       [ 348.,  363.,  435.],
       [ 363.,  435.,  491.],
       [ 435.,  491.,  505.],
       [ 491.,  505.,  404.],
       [ 505.,  404.,  359.],
       [ 404.,  359.,  310.],
       [ 359.,  310.,  337.],
       [ 310.,  337.,  360.],
       [ 337.,  360.,  342.],
       [ 360.,  342.,  406.],
       [ 342.,  406.,  396.],
       [ 406.,  396.,  420.],
       [ 396.,  420.,  472.],
       [ 420.,  472.,  548.],
       [ 472.,  548.,  559.],
       [ 548.,  559.,  463.],
       [ 559.,  463.,  407.],
       [ 463.,  407.,  362.],
       [ 407.,  362.,  405.],
       [ 362.,  405.,  417.],
       [ 405.,  417.,  391.],
       [ 417.,  391.,  419.],
       [ 391.,  419.,  461.],
       [ 419.,  461.,  472.],
       [ 461.,  472.,  535.],
       [ 472.,  535.,  622.],
       [ 535.,  622.,  606.],
       [ 622.,  606.,  508.],
       [ 606.,  508.,  461.],
       [ 508.,  461.,  390.]], dtype=float32), array([ 129.,  121.,  135.,  148.,  148.,  136.,  119.,  104.,  118.,
        115.,  126.,  141.,  135.,  125.,  149.,  170.,  170.,  158.,
        133.,  114.,  140.,  145.,  150.,  178.,  163.,  172.,  178.,
        199.,  199.,  184.,  162.,  146.,  166.,  171.,  180.,  193.,
        181.,  183.,  218.,  230.,  242.,  209.,  191.,  172.,  194.,
        196.,  196.,  236.,  235.,  229.,  243.,  264.,  272.,  237.,
        211.,  180.,  201.,  204.,  188.,  235.,  227.,  234.,  264.,
        302.,  293.,  259.,  229.,  203.,  229.,  242.,  233.,  267.,
        269.,  270.,  315.,  364.,  347.,  312.,  274.,  237.,  278.,
        284.,  277.,  317.,  313.,  318.,  374.,  413.,  405.,  355.,
        306.,  271.,  306.,  315.,  301.,  356.,  348.,  355.,  422.,
        465.,  467.,  404.,  347.,  305.,  336.,  340.,  318.,  362.,
        348.,  363.,  435.,  491.,  505.,  404.,  359.,  310.,  337.,
        360.,  342.,  406.,  396.,  420.,  472.,  548.,  559.,  463.,
        407.,  362.,  405.,  417.,  391.,  419.,  461.,  472.,  535.,
        622.,  606.,  508.,  461.,  390.,  432.], dtype=float32))

'''


# Data splitting and preprocessing.

# Fix the random seed for reproducibility.
numpy.random.seed(7)
# Step 1: normalise the data so that training converges faster.
# http://sklearn.apachecn.org/cn/0.19.0/modules/preprocessing.html#preprocessing-normalization
# NOTE(review): the scaler is fitted on the whole series before the train/test
# split, so test-set statistics leak into training -- confirm this is acceptable.
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
# Earlier approach (kept for reference): split the raw series first and window
# each part separately.
#train_size = int(len(dataset) * 0.95)
#test_size = len(dataset) - train_size
#train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
#look_back = 200
#trainX, trainY = create_dataset(train, look_back)
#testX, testY = create_dataset(test, look_back)
# Current approach: build the windows with create_dataset first, then slice the
# windowed arrays into train and test parts.
look_back = 200
# create_dataset is defined elsewhere in the file; presumably `a` holds the
# look_back-long input windows and `b` the matching targets -- TODO confirm.
a,b=create_dataset(dataset,look_back)
# The first 95% of the windows are used for training, the rest for testing.
train_size = int(len(a) * 0.95)
test_size = len(a) - train_size
trainX=a[:train_size,:]
trainY=b[:train_size]
testX=a[train_size:,:]
testY=b[train_size:]

# Insert a middle axis of length 1 (the time_step axis), giving arrays of shape
# (samples, 1, trainX.shape[1]); trainX.shape[1] is expected to equal look_back.
trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))








import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error











# Network definition, then fitting the data to the network.
# create and fit the LSTM network (Keras needs very little code for this).
model = Sequential()
# When no activation is given the LSTM default (tanh) is used.
model.add(LSTM(50, input_shape=(1, look_back),dropout=0.5, recurrent_dropout=0.5))         # 50 is the output dimension of the layer
# input_dim / input_shape must be specified when the layer is the first layer
# of the model.
model.add(Dense(50, activation='tanh'))
model.add(Dense(1, activation='tanh'))

model.compile(loss='mean_squared_error', optimizer='adam')
# verbose=0 keeps the training log silent (see the note in the string below).
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=0)
'''
verbose:日志显示,0为不在标准输出流输出日志信息,1为输出进度条记录,2为每个epoch输出一行记录
'''

# make predictions
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)



# invert predictions: map predictions and targets back to the original scale.
# The targets are wrapped in a list so that the scaler receives a 2-D input;
# the un-scaled targets are therefore read back as trainY[0] / testY[0].
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])

trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
# Element-wise absolute percentage error on the test set.
# NOTE(review): divides by the true value, so a zero target yields inf/nan.
wucha=abs(testY[0]-testPredict[:,0])/testY[0]
print(type(wucha))
# Mean absolute percentage error.
print('绝对值误差百分比平均:'+str(wucha.sum()/len(wucha)))



testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
print('Test Score: %.2f RMSE' % (testScore))

View Code

# -*- coding: utf-8 -*-
"""
Created on Fri Jul 20 10:58:02 2018


@author: 张博
"""

#读取csv最稳的方法:
#f = open(r'C:Users张博Desktop展示old.csv')
#data = read_csv(f,header=None)





'''
画图模板:
from matplotlib import pyplot
data=[]
pyplot.plot(data,color='black')
pyplot.show()

'''



'''
获取当前时间:
import datetime
nowTime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')#现在
nowTime=((nowTime)[:-3])
print(nowTime)
'''


'''
写文件的模板
with open(r'c:/234/wucha.txt','w') as f:
      wucha=str(wucha)
      f.write(wucha)
'''



'''
手动加断点的方法:raise 
'''


# -*- coding: utf-8 -*-
"""
Created on Fri Jul 20 10:58:02 2018


@author: 张博
"""









# -*- coding: utf-8 -*-
"""
Created on Tue Jul 17 10:54:38 2018

@author: 张博
"""

# -*- coding: utf-8 -*-
"""
Created on Mon Jul 16 17:18:57 2018

@author: 张博
"""

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""

# 2018-07-23 22:54: the whole experiment is wrapped in a for loop so that
# different learning-rate decay values (RATE) can be compared.
# As written the loop body runs exactly once.
for i in range((1)):

    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '0' # use GPU 0

    import tensorflow as tf
    from keras.backend.tensorflow_backend import set_session

    # Configure the TensorFlow session that Keras will use.
    config = tf.ConfigProto()
    config.gpu_options.allocator_type = 'BFC' #A "Best-fit with coalescing" algorithm, simplified from a version of dlmalloc.
    config.gpu_options.per_process_gpu_memory_fraction = 1.
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Reference tutorial (very detailed). Its final multivariate, multi-step
    # model is the one this script follows:
    #https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/

    '''
    SUPER_PARAMETER:一般代码习惯把超参数写最开始位置,方便修改和查找
    '''
    # Hyper-parameters are collected here so they are easy to find and change.
    EPOCH=100              # number of training epochs passed to model.fit
    LOOK_BACK=24           # number of past time steps fed to the model
    n_features = 3         # columns per time step; three columns are concatenated into the dataset later in this script
    RATE=0.55              # decay base used by the learning-rate schedule
    shenjing=62            # number of LSTM units
    n_hours = LOOK_BACK
    
    
    
    
    
    import pandas as pd
    
    from pandas import read_csv
    from datetime import datetime
    # load data
    def parse(x):
        """Parse a 'YYYY MM DD HH' string into a datetime."""
        hour_stamp_format = '%Y %m %d %H'
        parsed = datetime.strptime(x, hour_stamp_format)
        return parsed
    # NOTE(review): the path looks like it lost its backslashes when the post
    # was scraped -- confirm the real location of the CSV file.
    data = read_csv(r'E:output_nonghangout2.csv')

    # The DD (day) column should be dropped because the day number is not useful;
    # slicing the wanted columns and concatenating them is enough.


    # Column slices by position; they are taken again after the outlier repair
    # further down, so these three assignments are effectively overwritten.
    tmp1=data.iloc[:,2:3]
    tmp2=data.iloc[:,3]
    tmp3=data.iloc[:,1]

    # Dump the raw table for inspection.
    data.to_csv('c:/234/out00000.csv')
    
#    for i in range(len(tmp3)):
#        if tmp3[i] in range(12,13):
#            tmp3[i]=1
#        if tmp3[i] in range(13,14):
#            tmp3[i]=2
#        else:
#            tmp3[i]=0


    # Outlier repair.
    # A row is treated as an outlier when its 'Sum' is below 0.4 times the mean
    # 'Sum' of all rows sharing the same hour ('HH') and the same 'week' value.
    # Such a value is replaced by that group mean.
    #
    # The previous per-row loop assigned through data.iloc[i]['Sum'], which is
    # chained indexing: the write lands on a temporary row copy and `data` is
    # never changed. It also re-ran data.query() for every row (quadratic).
    # The group means are now computed once and written back with a mask.
    group_mean = data.groupby(['HH', 'week'])['Sum'].transform('mean')
    outlier_mask = data['Sum'] < group_mean * 0.4
    if outlier_mask.any():
        # Series.mask replaces the flagged entries and upcasts the column to
        # float when the means are not whole numbers.
        data['Sum'] = data['Sum'].mask(outlier_mask, group_mean)
    # Report the positional index of every repaired row, as before.
    for row_pos, is_outlier in enumerate(outlier_mask):
        if is_outlier:
            print('修改了如下行,因为他是异常点')
            print(row_pos)
            
            
    
    #修改完毕


    # Re-slice the columns by position after the outlier repair.
    tmp1=data.iloc[:,2:3]
    tmp2=data.iloc[:,3]
    tmp3=data.iloc[:,1]

    # Put column 3 first: the template below expects the value to predict in
    # the first column. Columns 1 and 2 follow as additional features.
    data=pd.concat([tmp2,tmp3,tmp1],axis=1)
    print(data)
    # Overwrites the dump written earlier with the reordered three-column table.
    data.to_csv('c:/234/out00000.csv')
    
    
    #因为下面的模板是把预测值放在了第一列.所以对data先做一个变换.
    
    
    
    
    
    
    
    
    
    
    
    
    
    #data.to_csv('pollution.csv')
    
    
    
    
    
    
    from pandas import read_csv
    from matplotlib import pyplot
    # Use the reordered frame as the dataset and take its values
    # (rows = time steps, columns = variables).
    dataset = data
    values = dataset.values
    
    
    
    ## specify columns to plot
    #groups = [0, 1, 2, 3, 5, 6, 7]
    #i = 1
    
    
    from pandas import read_csv
    from matplotlib import pyplot
    # load dataset
    #dataset = read_csv('pollution.csv', header=0, index_col=0)
    ##print(dataset.head())
    #values = dataset.values
    # specify columns to plot
    #groups = [0, 1, 2, 3, 5, 6, 7]
    #i = 1
    # plot each column
    #pyplot.figure()
    #图中每一行是一个列数据的展现.所以一共有7个小图,对应7个列指标的变化.
    #for group in groups:
    #    pyplot.subplot(len(groups), 1, i)
    #    pyplot.plot(values[:, group])
    #    pyplot.title(dataset.columns[group], y=0.5, loc='right')
    #    i += 1
    ##pyplot.show()
    
    
    
    from math import sqrt
    from numpy import concatenate
    from matplotlib import pyplot
    from pandas import read_csv
    from pandas import DataFrame
    from pandas import concat
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import mean_squared_error
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import LSTM
    # load dataset
    
    
    # integer encode direction
    #把标签标准化而已.比如把1,23,5,7,7标准化之后就变成了0,1,2,3,3
    #print('values')
    #print(values[:5])
    #encoder = LabelEncoder()
    #values[:,4] = encoder.fit_transform(values[:,4])
    ## ensure all data is float
    #values = values.astype('float32')
    #print('values_after_endoding')
    #numpy 转pd
    import pandas as pd
    #pd.DataFrame(values).to_csv('values_after_endoding.csv')
    #从结果可以看出来encoder函数把这种catogorical的数据转化成了数值类型,
    #方便做回归.
    #print(values[:5])
    # normalize features,先正规化.
    
    
    
    
    # Scale every column into [-1, 1]. Other ranges such as (0, 1), or other
    # normalisation methods, are worth trying here.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled = scaler.fit_transform(values)
    print('正规化之后的数据')

    # Dump the scaled values for inspection.
    pd.DataFrame(scaled).to_csv('values_after_normalization.csv')
    
    # frame as supervised learning
    
    
    
    
    # convert series to supervised learning
    #n_in:之前的时间点读入多少,n_out:之后的时间点读入多少.
    #对于多变量,都是同时读入多少.为了方便,统一按嘴大的来.
    #print('测试shift函数')
    #
    #df = DataFrame(scaled)
    #print(df)      # 从测试看出来shift就是数据同时向下平移,或者向上平移.
    #print(df.shift(2))
    def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
        """Frame a (multivariate) time series as a supervised-learning table.

        Each output row holds the values of every variable at the n_in previous
        time steps followed by the values at the current and the next
        n_out - 1 time steps.

        data    -- a list (treated as one variable) or a 2-D array-like whose
                   columns are the variables.
        n_in    -- number of lagged steps (t-n_in ... t-1) used as inputs.
        n_out   -- number of steps (t ... t+n_out-1) used as outputs.
        dropnan -- when true, rows that contain NaN because of the shifting
                   are dropped.

        Returns a pandas DataFrame with one named column per variable and step.
        """
        n_vars = 1 if type(data) is list else data.shape[1]
        frame = DataFrame(data)
        var_ids = range(1, n_vars + 1)

        # Lagged input blocks, oldest first: t-n_in, ..., t-1.
        lags = list(range(n_in, 0, -1))
        shifted = [frame.shift(lag) for lag in lags]
        labels = ['var%d(时间:t-%s)' % (v, lag) for lag in lags for v in var_ids]

        # Forecast blocks: t, t+1, ..., t+n_out-1.
        for step in range(n_out):
            shifted.append(frame.shift(-step))
            if step:
                labels.extend('var%d(时间:t+%d)' % (v, step) for v in var_ids)
            else:
                labels.extend('var%d(时间:t)' % v for v in var_ids)

        # Glue the blocks together side by side and name the columns.
        agg = concat(shifted, axis=1)
        agg.columns = labels
        # Shifting leaves NaN at the edges; optionally drop those rows.
        if dropnan:
            agg.dropna(inplace=True)
        return agg
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    # series_to_supervised lays out the columns of the multivariate series:
    # LOOK_BACK lagged steps of every variable, followed by the current step.

    reframed = series_to_supervised(scaled, LOOK_BACK, 1)

    # Only var1(t) is predicted; the other current-step columns are simply not
    # selected below (nothing is dropped from the frame).

    # split into train and test sets
    values = reframed.values
    # NOTE(review): the split point is 75% of len(scaled), not of len(values);
    # reframed has fewer rows once NaN rows are dropped -- confirm this is intended.
    n_train_hours = int(len(scaled)*0.75)
    train = values[:n_train_hours, :]
    test = values[n_train_hours:, :]
    # split into input and outputs
    # The first n_obs columns are the lagged inputs; column -n_features is the
    # first variable at the current step, i.e. the prediction target.
    n_obs = n_hours * n_features
    train_X, train_y = train[:, :n_obs], train[:, -n_features]
    test_X, test_y = test[:, :n_obs], test[:, -n_features]
    #print(train_X.shape, len(train_X), train_y.shape)
    #print(test_X.shape, len(test_X), test_y.shape)
    #print(train_X)
    #print(9999999999999999)
    #print(test_X)
    
    
    
    
    
    
    
    
    
    
    
    
    # Each flat input row of n_hours * n_features values is reshaped into
    # n_hours time steps of n_features values each (unlike the univariate
    # script, which used a single time step with look_back features).
    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], n_hours,n_features))
    test_X = test_X.reshape((test_X.shape[0], n_hours, n_features))
    
    #print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
    
    '''
    网络结构比较小的时候，效率瓶颈在CPU与GPU数据传输，这个时候只用cpu会更快。
    网络结构比较庞大的时候，gpu的提速就比较明显了。
    
    
    :显存和内存一样,属于随机存储,关机后自动清空。
    '''
    
    
    
    print('开始训练')

    # design network
    model = Sequential()
    import keras
    from keras import regularizers

    from keras import optimizers

    import keras

    # A single LSTM layer with `shenjing` units. The input shape is
    # (timesteps, features), taken from train_X. L2 penalties of 0.01 are put
    # on the kernel, recurrent and bias weights; both dropouts are disabled.
    # return_sequences=False means only the last output is returned.
    model.add(keras.layers.recurrent.LSTM(shenjing, input_shape=(train_X.shape[1], train_X.shape[2]),activation='tanh', 
                                         recurrent_activation='hard_sigmoid',
                                         kernel_initializer='random_uniform',
                                         bias_initializer='zeros',
                                         kernel_regularizer=regularizers.l2(0.01),
                                         recurrent_regularizer=regularizers.l2(0.01)
                                         , bias_regularizer=regularizers.l2(0.01), 
                                         dropout=0., recurrent_dropout=0.
                                         ,return_sequences=False))
    
    #model.add(Dense(60, activation='tanh',kernel_regularizer=regularizers.l2(0.01),
    #                bias_regularizer=regularizers.l2(0.01)))
    
    # returns a sequence of vectors of dimension 32
    #model.add(LSTM(32))  # return a single vector of dimension 32
    
    def schedule(epoch):
        """Return the learning rate to use for the given epoch index.

        The rate is piecewise constant: it starts at 0.002 and is lowered at
        fixed epoch boundaries; from epoch 15 on, the steps are powers of the
        RATE hyper-parameter.
        """
        rate = RATE
        # (exclusive upper epoch bound, learning rate), in ascending order.
        steps = (
            (3, 0.002),              # learn a little faster at the very start
            (10, 0.001),
            (15, 0.001*0.5),
            (20, 0.001*rate),
            (30, 0.001*rate**2),
            (70, 0.001*rate**3),
        )
        for upper_bound, lr in steps:
            if epoch < upper_bound:
                return lr
        # Epoch 70 and later.
        return 0.001*rate**4
    
    
    
    
    
    #发现这个层是必须加的.
#    model.add(keras.layers.core.Dropout(0.2, noise_shape=None, seed=None))
    
    
    
    
    
    
    
    # Two learning-rate callbacks: the fixed per-epoch schedule defined above,
    # and a plateau-based reducer with factor 0.5.
    # NOTE(review): both callbacks adjust the learning rate; presumably the
    # scheduler overrides the plateau reducer every epoch -- confirm.
    learning_rate=keras.callbacks.LearningRateScheduler(schedule)
    learning_rate2=keras.callbacks.ReduceLROnPlateau(factor=0.5)
    # input_dim / input_shape must be specified when a layer is the first layer
    # of the model.

#    model.add(Dense(1000,activation='tanh'),)
    # Single linear output unit: the predicted (scaled) target value.
    model.add(Dense(1))

    # Possible losses: mse, mae, mape, msle
    adam = optimizers.Adam(lr=0.001, clipnorm=.5)
    # NOTE(review): 'mape' is computed on targets scaled to [-1, 1]; values
    # near zero may make this loss very large -- confirm the choice.
    model.compile(loss='mape', optimizer=adam,metrics=['mae'])
    # fit network
    # Passing validation_data means no manual predict is needed to obtain the
    # validation curve; it can be plotted directly from `history`.
    history = model.fit(train_X, train_y, epochs=EPOCH, batch_size=1,
                        validation_data=(test_X, test_y),
                        verbose=1, shuffle=False,callbacks=[learning_rate,
             learning_rate2], )
    # plot history
    #pyplot.plot(history.history['loss'], label='train')
    #pyplot.plot(history.history['val_loss'], label='test')
    #pyplot.legend()
    #pyplot.show()
    
    
    
    # After training, predict directly.
    # make a prediction
    yhat = model.predict(test_X)         # "y hat": the statistics notation for the model's predictions on the test set
    # Flatten the inputs back to 2-D so their columns can be concatenated below.
    test_X = test_X.reshape((test_X.shape[0], n_hours*n_features))
    # invert scaling for forecast

    # The scaler was fitted on n_features columns, so before inverse_transform
    # the prediction is padded back to that width with the last
    # (n_features - 1) input columns.
    inv_yhat = concatenate((yhat, test_X[:, -(n_features-1):]), axis=1)
    inv_yhat = scaler.inverse_transform(inv_yhat)
    inv_yhat = inv_yhat[:,0] # after inverting, keep only the first column (the target)
    # invert scaling for actual

    # Same padding trick for the true target values.
    test_y = test_y.reshape((len(test_y), 1))
    inv_y = concatenate((test_y, test_X[:, -(n_features-1):]), axis=1)
    inv_y = scaler.inverse_transform(inv_y)
    inv_y = inv_y[:,0]
    
    
    
    
    
    
    
    
    
    # Write the true and the predicted series to text files.
    # NOTE(review): str() of a large numpy array is abbreviated with "...", so
    # these files may not contain every value -- confirm.
    with open(r'c:/234/inv_y.txt','w') as f:
          inv_y1=str(inv_y)
          f.write(inv_y1)
    with open(r'c:/234/inv_yhat.txt','w') as f:
          inv_yhat1=str(inv_yhat)
          f.write(inv_yhat1)



    # calculate RMSE
    rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
#    print('RATE:')
#    print(RATE)
    print('输出abs差百分比指标:')
    # The pollution index (from the original tutorial data) contains zeros,
    # which disturb this metric badly.
    #print(inv_y.shape)
    #print(inv_yhat.shape)
    # Element-wise absolute percentage error.
    # NOTE(review): divides by the true value, so a zero target gives inf/nan.
    wucha=abs(inv_y-inv_yhat)/(inv_y)
    #print(wucha)
    '''
    下面把得到的abs百分比误差写到 文件里面
    '''

    #with open(r'c:/234/wucha.txt','w') as f:
    #      print(type(wucha))
    #      wucha2=list(wucha)
    #      wucha2=str(wucha2)
    #      f.write(wucha2)
    
    # Append one line per run to the summary file:
    #     <RATE>，<LSTM units>,<mean absolute percentage error>.
    # The original newline literal had been split across two physical lines,
    # which is a syntax error; it is restored as '\n' here. The values are
    # formatted inline so the numeric hyper-parameter `shenjing` is no longer
    # rebound to a string.
    with open(r'c:/234/sumary.txt','a') as f:
        f.write(str(RATE) + '，' + str(shenjing) + ',' + str(wucha.mean()) + '.' + '\n')
    
    
    # Collapse the per-sample percentage errors into their mean and show it.
    wucha = wucha.mean()
    print(wucha)

    #print('Test RMSE: %.3f' % rmse)
    import numpy as np

    from matplotlib import pyplot

    # Wide, short figure; grey default colour map.
    pyplot.rcParams.update({'figure.figsize': (20, 3), 'image.cmap': 'gray'})

    # True series in black, prediction in red, on the same axes.
    for series, colour in ((inv_y, 'black'), (inv_yhat, 'red')):
        pyplot.plot(series, color=colour, linewidth=0.7)

    pyplot.show()
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    '''
    获取当前时间:
    import datetime
    nowTime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')#现在
    nowTime=((nowTime)[:-3])
    print(nowTime)
    '''
    
    
    '''
    写文件的模板
    with open(r'c:/234/wucha.txt','w') as f:
          wucha=str(wucha)
          f.write(wucha)
    '''
    
    
    
    
    
    
    '''
    手动加断点的方法:raise NameError #这种加断点方法靠谱
    '''
    
    '''
    画图模板:
    import numpy as np
    
    from matplotlib import pyplot
    pyplot.rcParams['figure.figsize'] = (20, 3) # 设置figure_size尺寸
    
    pyplot.rcParams['image.cmap'] = 'gray' # 
    pyplot.plot(inv_y,color='black',linewidth = 0.7)
    
    
    pyplot.show()
    
    
    '''
    
    #读取csv最稳的方法:
    #f = open(r'C:Users张博Desktop展示old.csv')
    #data = read_csv(f,header=None)

View Code

实验楼的java计算器脚本:不是太懂

package ffffff;


import java.awt.*;
import java.awt.event.*;
import javax.swing.*;
import java.util.Vector;
import java.math.BigDecimal;

/**
 * A small four-function Swing calculator.
 *
 * The calculator keeps two operand strings and a pending operator. Key presses
 * are routed to one of the operands by a handful of integer "switches"
 * (k1..k5); pressing an operator or "=" evaluates the pending operation.
 */
public class Calculator {

    // Operand 1. Always initialised (never null) so that it can be parsed.
    String str1 = "0";

    // Operand 2.
    String str2 = "0";

    // Pending operator: "+", "-", "*", "/", or "" after Clear.
    String signal = "+";

    // Text of the most recently computed result.
    String result = "";

    // --- State switches ---

    // k1 selects the operand being typed: 1 -> str1, 2 -> str2.
    int k1 = 1;
    // k2 counts operator presses; k2 > 1 means a chained expression such as 2+3-9+8.
    int k2 = 1;
    // k3 == 1 means the next key press starts str1 from scratch.
    int k3 = 1;
    // k4 == 1 means the next key press starts str2 from scratch.
    int k4 = 1;
    // k5 == 1 means a decimal point may be entered; otherwise it is dropped.
    int k5 = 1;
    // The button pressed most recently.
    JButton store;

    // Every button pressed since the last Clear, oldest first. Used to detect
    // two operator presses in a row.
    @SuppressWarnings("rawtypes")
    Vector vt = new Vector(20, 10);

    // UI components.
    JFrame frame = new JFrame("Calculator");
    JTextField result_TextField = new JTextField(result, 20);
    JButton clear_Button = new JButton("Clear");
    JButton button0 = new JButton("0");
    JButton button1 = new JButton("1");
    JButton button2 = new JButton("2");
    JButton button3 = new JButton("3");
    JButton button4 = new JButton("4");
    JButton button5 = new JButton("5");
    JButton button6 = new JButton("6");
    JButton button7 = new JButton("7");
    JButton button8 = new JButton("8");
    JButton button9 = new JButton("9");
    JButton button_Dian = new JButton(".");
    JButton button_jia = new JButton("+");
    JButton button_jian = new JButton("-");
    JButton button_cheng = new JButton("*");
    JButton button_chu = new JButton("/");
    JButton button_dy = new JButton("=");

    /** Builds the window, shows it and wires up all button listeners. */
    public Calculator() {

        // Keyboard mnemonic for the "0" button; the other mnemonics are
        // intentionally left out and can be added the same way.
        button0.setMnemonic(KeyEvent.VK_0);

        // Right-align the display so input and results hug the right edge.
        result_TextField.setHorizontalAlignment(JTextField.RIGHT);

        // Key pad: a 4x4 grid, filled row by row in this order.
        JPanel pan = new JPanel();
        pan.setLayout(new GridLayout(4, 4, 5, 5));
        JButton[] keypad = {
            button7, button8, button9, button_chu,
            button4, button5, button6, button_cheng,
            button1, button2, button3, button_jian,
            button0, button_Dian, button_dy, button_jia
        };
        for (JButton key : keypad) {
            pan.add(key);
        }
        pan.setBorder(BorderFactory.createEmptyBorder(5, 5, 5, 5));

        // Top row: display on the left, Clear on the right.
        JPanel pan2 = new JPanel();
        pan2.setLayout(new BorderLayout());
        pan2.add(result_TextField, BorderLayout.WEST);
        pan2.add(clear_Button, BorderLayout.EAST);

        // Position on screen; the window cannot be resized.
        frame.setLocation(300, 200);
        frame.setResizable(false);
        frame.getContentPane().setLayout(new BorderLayout());
        frame.getContentPane().add(pan2, BorderLayout.NORTH);
        frame.getContentPane().add(pan, BorderLayout.CENTER);

        frame.pack();
        frame.setVisible(true);

        // ---- Event handlers ----

        // Digit keys: append the digit to the operand being typed.
        class Listener implements ActionListener {
            public void actionPerformed(ActionEvent e) {
                remember(e);
                appendToOperand(store.getText());
            }
        }

        // Operator keys.
        class Listener_signal implements ActionListener {
            public void actionPerformed(ActionEvent e) {
                remember(e);
                String ss2 = store.getText();

                if (k2 == 1) {
                    // First operator of the expression: start typing operand 2.
                    k1 = 2;
                    k5 = 1;
                    signal = ss2;
                    k2 = k2 + 1;
                } else {
                    // Chained expression: evaluate what has been typed so far,
                    // unless the previous key was itself an operator.
                    int a = vt.size();
                    JButton c = a >= 2 ? (JButton) vt.get(a - 2) : null;

                    if (c != null && !isOperator(c.getText())) {
                        cal();
                        str1 = result;
                        k1 = 2;
                        k5 = 1;
                        k4 = 1;
                        signal = ss2;
                    }
                    k2 = k2 + 1;
                }
            }
        }

        // Clear key: reset every switch and operand and empty the display.
        class Listener_clear implements ActionListener {
            public void actionPerformed(ActionEvent e) {
                remember(e);
                k5 = 1;
                k2 = 1;
                k1 = 1;
                k3 = 1;
                k4 = 1;
                str1 = "0";
                str2 = "0";
                signal = "";
                result = "";
                result_TextField.setText(result);
                vt.clear();
            }
        }

        // "=" key: evaluate and make the result the new operand 1.
        class Listener_dy implements ActionListener {
            public void actionPerformed(ActionEvent e) {
                remember(e);
                cal();

                // Restore the switches for the next expression. k5 is reset as
                // well; previously a decimal point typed right after "=" was
                // silently dropped if the last operand had contained one.
                k1 = 1;
                k2 = 1;
                k3 = 1;
                k4 = 1;
                k5 = 1;

                str1 = result;
            }
        }

        // Decimal point: accepted at most once per operand.
        class Listener_xiaos implements ActionListener {
            public void actionPerformed(ActionEvent e) {
                remember(e);
                if (k5 == 1) {
                    appendToOperand(store.getText());
                }
                // Any further decimal point for this operand is ignored.
                k5 = k5 + 1;
            }
        }

        // ---- Bind the handlers to the components ----
        Listener_dy jt_dy = new Listener_dy();
        Listener jt = new Listener();
        Listener_signal jt_signal = new Listener_signal();
        Listener_clear jt_c = new Listener_clear();
        Listener_xiaos jt_xs = new Listener_xiaos();

        JButton[] digitButtons = {
            button0, button1, button2, button3, button4,
            button5, button6, button7, button8, button9
        };
        for (JButton digit : digitButtons) {
            digit.addActionListener(jt);
        }
        JButton[] operatorButtons = { button_jia, button_jian, button_cheng, button_chu };
        for (JButton operator : operatorButtons) {
            operator.addActionListener(jt_signal);
        }
        button_Dian.addActionListener(jt_xs);
        button_dy.addActionListener(jt_dy);
        clear_Button.addActionListener(jt_c);

        // Closing the window terminates the program.
        frame.addWindowListener(new WindowAdapter() {
            public void windowClosing(WindowEvent e) {
                System.exit(0);
            }
        });

    }

    // Records the pressed button in `store` and appends it to the history.
    @SuppressWarnings("unchecked")
    private void remember(ActionEvent e) {
        store = (JButton) e.getSource();
        vt.add(store);
    }

    // Appends key text to the operand selected by k1 and shows that operand.
    // The operand is started from scratch first when its "fresh" switch is 1.
    private void appendToOperand(String ss) {
        if (k1 == 1) {
            if (k3 == 1) {
                str1 = "";
                // A fresh operand may contain a decimal point again.
                k5 = 1;
            }
            str1 = str1 + ss;
            k3 = k3 + 1;
            result_TextField.setText(str1);
        } else if (k1 == 2) {
            if (k4 == 1) {
                str2 = "";
                k5 = 1;
            }
            str2 = str2 + ss;
            k4 = k4 + 1;
            result_TextField.setText(str2);
        }
    }

    // True when the text is one of the four operator symbols.
    private static boolean isOperator(String text) {
        return text.equals("+") || text.equals("-")
                || text.equals("*") || text.equals("/");
    }

    // True when the value is NaN or infinite (BigDecimal cannot represent it).
    private static boolean isNonFinite(double value) {
        return Double.isNaN(value) || Double.isInfinite(value);
    }

    // Applies the operator to the two operands. Division by zero yields 0 and
    // an unknown operator yields 0, as before.
    private static double compute(double a2, double b2, String op) {
        if (op.equals("+")) {
            return a2 + b2;
        }
        if (op.equals("-")) {
            return a2 - b2;
        }
        if (op.equals("*")) {
            if (isNonFinite(a2) || isNonFinite(b2)) {
                // new BigDecimal("Infinity"/"NaN") throws NumberFormatException.
                return a2 * b2;
            }
            // Multiply through BigDecimal to avoid binary rounding artefacts.
            BigDecimal m1 = new BigDecimal(Double.toString(a2));
            BigDecimal m2 = new BigDecimal(Double.toString(b2));
            return m1.multiply(m2).doubleValue();
        }
        if (op.equals("/")) {
            return b2 == 0 ? 0 : a2 / b2;
        }
        return 0;
    }

    /**
     * Evaluates "str1 signal str2", stores the text of the outcome in
     * {@code result} and shows it. When no operator is pending, a hint is
     * shown instead and {@code result} is left unchanged.
     */
    public void cal() {
        String c = signal;

        if (c.equals("")) {
            result_TextField.setText("Please input operator");
            return;
        }

        // A lone "." counts as zero. An empty operand (possible after the hint
        // above leaves result empty) is treated the same way instead of
        // crashing the number parser.
        if (str1.isEmpty() || str1.equals(".")) {
            str1 = "0.0";
        }
        if (str2.isEmpty() || str2.equals(".")) {
            str2 = "0.0";
        }
        double a2 = Double.parseDouble(str1);
        double b2 = Double.parseDouble(str2);

        result = Double.toString(compute(a2, b2, c));
        result_TextField.setText(result);
    }

    /** Sets the look and feel, then creates the calculator window. */
    public static void main(String[] args) {
        // Optional: choose the look and feel of the window.
        try {
            UIManager.setLookAndFeel("javax.swing.plaf.metal.MetalLookAndFeel");
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Swing components must be created on the event dispatch thread.
        SwingUtilities.invokeLater(new Runnable() {
            public void run() {
                new Calculator();
            }
        });
    }

}

View Code

增强学习的学习:

1. pip install gym

可以看到，增强学习和监督学习的区别主要有以下两点：

1. 增强学习是试错学习(Trial-and-error)，由于没有直接的指导信息，智能体要不断与环境进行交互，通过试错的方式来获得最佳策略。

2. 延迟回报，增强学习的指导信息很少，而且往往是在事后（最后一个状态）才给出的，这就导致了一个问题，就是获得正回报或者负回报以后，如何将回报分配给前面的状态。

上篇我们提到增强学习学到的是一个从环境状态到动作的映射（即行为策略），记为策略π: S→A。而增强学习往往又具有延迟回报的特点: 如果在第n步输掉了棋，那么只有状态s_n和动作a_n获得了立即回报r(s_n,a_n)=-1，前面的所有状态立即回报均为0。所以对于之前的任意状态s和动作a，立即回报函数r(s,a)无法说明策略的好坏。因而需要定义值函数(value function，又叫效用函数)来表明当前状态下策略π的长期影响。

价值函数:第二行有错误多写了一个γ.

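当一个策略取定,就是说Si,ai 这个数组取定.那么拟合下面这个等比函数.

$$V^{\pi}(s) = E_{\pi}\left[\sum_{i=0}^{\infty} \gamma^{i} r_{i} \,\middle|\, s_{0}=s\right] = E_{\pi}\left[r_{0} + \gamma r_{1} + \gamma^{2} r_{2} + \cdots \,\middle|\, s_{0}=s\right]$$

其中r_i表示未来第i步回报，γ∈[0,1]是折扣因子。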

PageRank算法简介

http://blog.jobbole.com/71431/

1.基本方法是矩阵不停的乘法,一直到收敛

2.利用加个概率来更改迭代公式即可解决终止点问题和陷阱问题.

重新学机器学习:

https://blog.csdn.net/supercally/article/details/54754787

https://blog.csdn.net/Young_Gy/article/details/73485518

$$Q(s,a) \leftarrow r + \gamma \max_{a'} Q(s',a')$$

这里面的max是对a'来取得.因为s'已经取定了.

●对于bellman公式的理解.

这个Q表是2维的,行表示s取值,列表示a取值.

为什么是这个公式:r表示原来的Q(s,a),max(Q(s',a'))表示s'状态的最大价值.用这个价值来表示s'状态的价值.(这么做是合理的,

因为我们的决策是每一步按照百分之90的概率选最大的action.所以这个最大价值来替换价值是几乎必然事件).

所以他也就是Q(s,a)这一步走完会获得的reward.

●代码和理解:

View Code

● 用dnn跑nlp 准确率0.85

# -*- coding: utf-8 -*-
"""
Created on Fri Jul 20 10:58:02 2018

#如果跑不了就是编码问题,用记事本另存一下,把编码改成utf-8保存即可.
#利用cmd跑这种画图程序,就能旋转图片了.spyder不能旋转
@author: 张博
"""




'''
读取csv最稳的方法:
f=open(r'C:/Users/old.csv')
data = read_csv(f,header=None)



'''


'''
画图模板:
from matplotlib import pyplot
data=[]
pyplot.plot(data,color='black')
pyplot.show()

'''



'''
获取当前时间:
import datetime
nowTime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')#现在
nowTime=((nowTime)[:-3])
print(nowTime)
'''


'''
写文件的模板
with open(r'c:/234/wucha.txt','w') as f:
      wucha=str(wucha)
      f.write(wucha)
'''



'''
手动加断点的方法:raise 
'''


'''
excel表格实用技巧:
全选然后选开始-行和列-最适合的列宽.
这样看表格清晰多了!
'''


'''
时间序列画图
from matplotlib import pyplot
#画布大小
pyplot.rcParams['figure.figsize'] = (300, 3) # 设置figure_size尺寸
import matplotlib.dates as mdates
ax=plt.gca()
pyplot.rcParams['image.cmap'] = 'gray' # 
xfmt = mdates.DateFormatter('%y-%m-%d %H:%M')
ax.xaxis.set_major_formatter(xfmt)
#下面这个是时间序列的间隔时间
plt.xticks(pd.date_range(data[0][0],data[-1][0],freq='2H'),rotation=90)
#样式
pyplot.plot(data[:,0],data[:,1],color='red',linewidth = 0.7)

pyplot.show()
'''



'''
#画3d
import  matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D



from matplotlib import pyplot
pyplot.rcParams['figure.figsize'] = (3, 3) # 设置figure_size尺寸


fig=plt.figure()

ax3d=Axes3D(fig)    #绘制3D图形
ax3d.scatter(data[:,1],data[:,2],data[:,0],c='r',marker=".")

pyplot.show()
'''


'''
非数值编码
#编码
from sklearn.preprocessing import LabelEncoder

a=a.values #切换成ndarry

encoder = LabelEncoder()
for i in range(5):
 a[:,i] = encoder.fit_transform(a[:,i])
'''



'''
#标准化
from sklearn.preprocessing import MinMaxScaler



scaler = MinMaxScaler(feature_range=(0, 1))
a = scaler.fit_transform(a)
print(a)
'''




'''
降维可视化:
    
tSNE的核心思想是：把高维数据看做高维空间中的点xixi，然后用流形方法将其映射
到低维空间中的yiyi，要求保持其空间距离。即高维空间相距较近/远的点，
映射到低维空间中仍然较近/远。为了让挨得近的点靠的更近，距离用高斯函数来度量

from sklearn.manifold import TSNE
#data可以是多少维都可以,都能降成2维
tsne=TSNE()
tsne.fit_transform(data)  #进行数据降维,降成两维
#a=tsne.fit_transform(data_zs) #a是一个array,a相当于下面的tsne_embedding_
tsne=pd.DataFrame(tsne.embedding_) #转换数据格式

print(tsne)

tsne['聚类类别']=label_pred
print(tsne)
d=tsne[tsne[u'聚类类别']==0]
plt.plot(d[0],d[1],'r.')
 
d=tsne[tsne[u'聚类类别']==1]
plt.plot(d[0],d[1],'go')

d=tsne[tsne[u'聚类类别']==2]
plt.plot(d[0],d[1],'b*')

d=tsne[tsne[u'聚类类别']==3]
plt.plot(d[0],d[1],'y+')
plt.show()


'''


'''
下面我们用dnn来解决nlp问题
'''


# 探索一下数据情况
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
from keras.datasets import imdb
from keras.layers import Embedding, Flatten, Dense
from keras.models import Sequential
from keras.preprocessing import sequence
#查询这个文档https://keras.io/datasets/
#'''
#果断把keras里面的7大数据集都下载下来学习用:
#from keras.datasets 找这个.py文件.spyder里面右键选择go to definition即可逐步深入就能找到
#7个下载地址.
#下载完进入C:Users张博.kerasdatasets 里面把下载后的放里面即可.
#要问问什么这么做,答案就是看源代码即可.逐步go to definition找代码里面语句找到的方法.
#'''
## EDA

# Load the data. Source: https://s3.amazonaws.com/text-datasets/imdb_full.pkl
(x_train, y_train), (x_test, y_test) = imdb.load_data()
'''
keras官方文档地址:比中文的全多了
https://keras.io/datasets/ 
http://keras-cn.readthedocs.io/en/latest/other/datasets/
'''
# Take a first look at the labels: their shape, one sample and its type.
print(y_train.shape)
print(y_train[24000])
print(type(y_train[24000]))
'''
从打印可以看出来x_train的数据是一个25000*1个数组的数据.


在keras提供的IMDB数据集中，word被映射为一个大于0的整数，表示该单词出现频率的排名，
这样处理的目的是为了方便按照词频过滤单词，其中0用于表示unknown word

x_train的数据是一个25000*1个int的数据.得到的是12500个0,12500个1.表示情感的类别
x_test 和y_test跟上面类似.



'''


# Length (number of word indices) of every training review.
lens = [len(review) for review in x_train]
print(lens[0])
print(len(x_train[0]))

avg_len = np.mean(lens)
print(avg_len)

# Histogram bins of width 50 covering the whole range of lengths.
hist_bins = range(min(lens), max(lens) + 50, 50)
print(hist_bins)

plt.hist(lens, bins=hist_bins)


plt.show()
'''
得到hist图表示长度的分布情况
'''

# The reviews have different lengths, so bring them all to one common length.
# m is the longest review found in either split.
m = max(max(map(len, x_train)), max(map(len, x_test)))
print(m)
print(x_train.shape)

#%%
print('m=%d' % m)
# Cap the sequence length at 400 tokens (or at m when every review is shorter).
maxword = min(400, m)
print(x_train.shape)

# pad_sequences turns the array of variable-length lists into a 2-D tensor.
x_train = sequence.pad_sequences(x_train, maxlen=maxword)
print(x_train.shape)

x_test = sequence.pad_sequences(x_test, maxlen=maxword)
# Every row of x_train now has exactly maxword entries.
print(len(x_train[1546]))

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

# Vocabulary size = largest word index + 1.  The maximum is taken over BOTH
# splits: x_test is fed through the same Embedding layer as validation data,
# so an index that only occurs there must still fit inside the lookup table.
vocab_siz = int(max(x_train.max(), x_test.max())) + 1

print(x_train[0])

print('vocab_siz=%d' % vocab_siz)  # total number of distinct word indices
print('x_train.shape=[%d,%d]' % (x_train.shape[0], x_train.shape[1]))
'''
下面就是feature 400 ,sample 25000的dnn学习即可.
'''
#构建模型
model = Sequential()
# 第一层是嵌入层,矩阵为 vocab_siz * 64
'''
maxword=400
'''


'''
任务:理解embedding层的作用和用法:
官网例子:
model = Sequential()
model.add(Embedding(1000, 64, input_length=10))
# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, 10, 64), where None is the batch dimension.

input_array = np.random.randint(1000, size=(32, 10))

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
assert output_array.shape == (32, 10, 64)



'''
#跑一下例子也无妨,
'''
本质上embedding层是一个编码层从下面例子可以看出来.
还是nlp的例子,下面例子中0到999中一个数字代表一个汉字,那么input_array就表示
32*10个汉字.然而这种用0到999来表示一个汉字显然不符合汉字的意义.(这个构造是频率表排序)
例如我用0表示我,1表示是,2表示他.显然0和1不相似,0和2相似.但是编码上0,1,2看不出来的.
这就需要一个只有weight的全链接层来自动学习编码规则.学习如何从0到999每一个都编码
到一个64长度的向量.这样一个64长度的向量来表示一个汉字,就非常舒服了.处理效率也高了.
比你1000要快多了(因为1000的本质是1000维向量).

input_length参数非常重要他表示你输入数据的列数.
总结:一句话:把之前的one-hot编码 32*10*1000 压缩到了32*10*64,速度快了好多!!!!!!!!
ps:这个层是nlp必须加的,也必须加在第一层上.
'''
# Stand-alone run of the Embedding example from the Keras documentation.
# The model takes an integer matrix of shape (batch, input_length); the largest
# integer (word index) in the input must not exceed 999 (the vocabulary size).
model = Sequential([Embedding(1000, 64, input_length=10)])
# At this point model.output_shape == (None, 10, 64); None is the batch dimension.

input_array = np.random.randint(1000, size=(32, 10))

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)

# Each of the 32*10 indices is mapped to a 64-dimensional vector.
expected_shape = (32, 10, 64)
assert output_array.shape == expected_shape
print(output_array.shape == expected_shape)  # True



# Start again from an empty model for the actual sentiment classifier.
model = Sequential()

# Embedding layer: a vocab_siz x 64 lookup table applied to every token.
model.add(Embedding(vocab_siz, 64, input_length=maxword))

# The embedding output is (samples, maxword, 64); the original article stated
# this shape incorrectly.  Flatten it so Dense layers can consume it.
model.add(Flatten())

# Stack of fully connected layers with shrinking width.
for units in (2000, 500, 200, 50):
    model.add(Dense(units, activation='relu'))

# Final layer outputs a single value in 0..1, like logistic regression.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
model.summary()

# Train.  `epochs` is the Keras 2 name of the argument formerly called
# `nb_epoch`; newer releases reject the old spelling.
model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=100, epochs=2, verbose=1)
score = model.evaluate(x_test, y_test)
print(score)

View Code

nlp:

框架nltk学习:

# -*- coding: utf-8 -*-
"""
Created on Fri Jul 20 10:58:02 2018

#如果跑不了就是编码问题,用记事本另存一下,把编码改成utf-8保存即可.
#利用cmd跑这种画图程序,就能旋转图片了.spyder不能旋转
@author: 张博
"""

'''
读乱码文件的方法:
a=open(u'C:/Users/张博/Desktop/news_sohusite_xml.smarty.dat','r',encoding='utf-8')
a=a.readlines()
print(a)

'''


'''
读取csv最稳的方法:
f=open(r'C:/Users/old.csv')
data = read_csv(f,header=None)



'''


'''
画图模板:
from matplotlib import pyplot
data=[]
pyplot.plot(data,color='black')
pyplot.show()

'''



'''
获取当前时间:
import datetime
nowTime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')#现在
nowTime=((nowTime)[:-3])
print(nowTime)
'''


'''
写文件的模板
with open(r'c:/234/wucha.txt','w') as f:
      wucha=str(wucha)
      f.write(wucha)
'''



'''
手动加断点的方法:raise 
'''


'''
excel表格实用技巧:
全选然后选开始-行和列-最适合的列宽.
这样看表格清晰多了!
'''


'''
时间序列画图
from matplotlib import pyplot
#画布大小
pyplot.rcParams['figure.figsize'] = (300, 3) # 设置figure_size尺寸
import matplotlib.dates as mdates
ax=plt.gca()
pyplot.rcParams['image.cmap'] = 'gray' # 
xfmt = mdates.DateFormatter('%y-%m-%d %H:%M')
ax.xaxis.set_major_formatter(xfmt)
#下面这个是时间序列的间隔时间
plt.xticks(pd.date_range(data[0][0],data[-1][0],freq='2H'),rotation=90)
#样式
pyplot.plot(data[:,0],data[:,1],color='red',linewidth = 0.7)

pyplot.show()
'''



'''
#画3d
import  matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D



from matplotlib import pyplot
pyplot.rcParams['figure.figsize'] = (3, 3) # 设置figure_size尺寸


fig=plt.figure()

ax3d=Axes3D(fig)    #绘制3D图形
ax3d.scatter(data[:,1],data[:,2],data[:,0],c='r',marker=".")

pyplot.show()
'''


'''
非数值编码
#编码
from sklearn.preprocessing import LabelEncoder

a=a.values #切换成ndarray

encoder = LabelEncoder()
for i in range(5):
 a[:,i] = encoder.fit_transform(a[:,i])
'''



'''
#标准化
from sklearn.preprocessing import MinMaxScaler



scaler = MinMaxScaler(feature_range=(0, 1))
a = scaler.fit_transform(a)
print(a)
'''




'''
降维可视化:
    
tSNE的核心思想是：把高维数据看做高维空间中的点xixi，然后用流形方法将其映射
到低维空间中的yiyi，要求保持其空间距离。即高维空间相距较近/远的点，
映射到低维空间中仍然较近/远。为了让挨得近的点靠的更近，距离用高斯函数来度量

from sklearn.manifold import TSNE
#data可以是多少维都可以,都能降成2维
tsne=TSNE()
tsne.fit_transform(data)  #进行数据降维,降成两维
#a=tsne.fit_transform(data_zs) #a是一个array,a相当于下面的tsne_embedding_
tsne=pd.DataFrame(tsne.embedding_) #转换数据格式

print(tsne)

tsne['聚类类别']=label_pred
print(tsne)
d=tsne[tsne[u'聚类类别']==0]
plt.plot(d[0],d[1],'r.')
 
d=tsne[tsne[u'聚类类别']==1]
plt.plot(d[0],d[1],'go')

d=tsne[tsne[u'聚类类别']==2]
plt.plot(d[0],d[1],'b*')

d=tsne[tsne[u'聚类类别']==3]
plt.plot(d[0],d[1],'y+')
plt.show()


'''


'''
下面我们用dnn来解决nlp问题
'''


# 探索一下数据情况
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
from keras.datasets import imdb
from keras.layers import Embedding, Flatten, Dense
from keras.models import Sequential
from keras.preprocessing import sequence
#查询这个文档https://keras.io/datasets/
#'''
#果断把keras里面的7大数据集都下载下来学习用:
#from keras.datasets 找这个.py文件.spyder里面右键选择go to definition即可逐步深入就能找到
#7个下载地址.
#下载完进入C:\Users\张博\.keras\datasets 里面把下载后的放里面即可.
#要问为什么这么做,答案就是看源代码即可.逐步go to definition找代码里面语句找到的方法.
#'''
## EDA

# 加载数据，这个数据来自： https://s3.amazonaws.com/text-datasets/imdb_full.pkl
# Load the IMDB sentiment data set that ships with Keras as train/test splits.
(x_train, y_train), (x_test, y_test) = imdb.load_data()
# Official Keras data set documentation (more complete than the Chinese mirror):
#   https://keras.io/datasets/
#   http://keras-cn.readthedocs.io/en/latest/other/datasets/

# First look at the labels: overall shape, one sample value and its type.
print(y_train.shape)
sample_label = y_train[24000]
print(sample_label)
print(type(sample_label))
# Author's notes (translated, not verified by this script):
#   - x_train holds 25000 entries, each of them a sequence of integers.
#   - Every word is mapped to an integer > 0 that is its frequency rank, which
#     makes filtering by word frequency easy; 0 stands for an unknown word.
#   - The labels are 25000 ints, 12500 zeros and 12500 ones (sentiment class).
#   - x_test / y_test are organised the same way.

# Number of tokens in every training review.
lens = [len(review) for review in x_train]
print(lens[0])
print(len(x_train[0]))

avg_len = np.mean(lens)
print(avg_len)

# Histogram of review lengths using 50-token wide bins.
length_bins = range(min(lens), max(lens) + 50, 50)
print(length_bins)
plt.hist(lens, bins=length_bins)
plt.show()
'''
得到hist图表示长度的分布情况
'''

# The reviews have different lengths, so bring them all to one common length.
# m is the longest review found in either split.
m = max(max(map(len, x_train)), max(map(len, x_test)))
print(m)
print(x_train.shape)

#%%
print('m=%d' % m)
# Cap the sequence length at 400 tokens (or at m when every review is shorter).
maxword = min(400, m)
print(x_train.shape)

# pad_sequences turns the array of variable-length lists into a 2-D tensor.
x_train = sequence.pad_sequences(x_train, maxlen=maxword)
print(x_train.shape)

x_test = sequence.pad_sequences(x_test, maxlen=maxword)
# Every row of x_train now has exactly maxword entries.
print(len(x_train[1546]))

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

# Vocabulary size = largest word index + 1.  The maximum is taken over BOTH
# splits: x_test is fed through the same Embedding layer as validation data,
# so an index that only occurs there must still fit inside the lookup table.
vocab_siz = int(max(x_train.max(), x_test.max())) + 1

print(x_train[0])

print('vocab_siz=%d' % vocab_siz)  # total number of distinct word indices
print('x_train.shape=[%d,%d]' % (x_train.shape[0], x_train.shape[1]))
'''
下面就是feature 400 ,sample 25000的dnn学习即可.
'''
#构建模型
model = Sequential()
# 第一层是嵌入层,矩阵为 vocab_siz * 64
'''
maxword=400
'''


'''
任务:理解embedding层的作用和用法:
官网例子:
model = Sequential()
model.add(Embedding(1000, 64, input_length=10))
# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, 10, 64), where None is the batch dimension.

input_array = np.random.randint(1000, size=(32, 10))

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
assert output_array.shape == (32, 10, 64)



'''
#跑一下例子也无妨,
'''
本质上embedding层是一个编码层从下面例子可以看出来.
还是nlp的例子,下面例子中0到999中一个数字代表一个汉字,那么input_array就表示
32*10个汉字.然而这种用0到999来表示一个汉字显然不符合汉字的意义.(这个构造是频率表排序)
例如我用0表示我,1表示是,2表示他.显然0和1不相似,0和2相似.但是编码上0,1,2看不出来的.
这就需要一个只有weight的全链接层来自动学习编码规则.学习如何从0到999每一个都编码
到一个64长度的向量.这样一个64长度的向量来表示一个汉字,就非常舒服了.处理效率也高了.
比你1000要快多了(因为1000的本质是1000维向量).

input_length参数非常重要他表示你输入数据的列数.
总结:一句话:把之前的one-hot编码 32*10*1000 压缩到了32*10*64,速度快了好多!!!!!!!!
ps:这个层是nlp必须加的,也必须加在第一层上.
'''
# Stand-alone run of the Embedding example from the Keras documentation.
# The model takes an integer matrix of shape (batch, input_length); the largest
# integer (word index) in the input must not exceed 999 (the vocabulary size).
model = Sequential([Embedding(1000, 64, input_length=10)])
# At this point model.output_shape == (None, 10, 64); None is the batch dimension.

input_array = np.random.randint(1000, size=(32, 10))

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)

# Each of the 32*10 indices is mapped to a 64-dimensional vector.
expected_shape = (32, 10, 64)
assert output_array.shape == expected_shape
print(output_array.shape == expected_shape)  # True



# Start again from an empty model for the actual sentiment classifier.
model = Sequential()

# Embedding layer: a vocab_siz x 64 lookup table applied to every token.
model.add(Embedding(vocab_siz, 64, input_length=maxword))

# The embedding output is (samples, maxword, 64); the original article stated
# this shape incorrectly.  Flatten it so Dense layers can consume it.
model.add(Flatten())

# Stack of fully connected layers with shrinking width.  The last hidden layer
# is 50 units wide (this copy had 500, which widened the network again after
# the 200-unit layer and disagreed with the otherwise identical earlier listing).
for units in (2000, 500, 200, 50):
    model.add(Dense(units, activation='relu'))

# Final layer outputs a single value in 0..1, like logistic regression.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
model.summary()

# Train.  `epochs` is the Keras 2 name of the argument formerly called
# `nb_epoch`; newer releases reject the old spelling.
model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=100, epochs=2, verbose=1)
score = model.evaluate(x_test, y_test)
print(score)




#%%
'''
https://blog.csdn.net/zyp199301/article/details/70134203?locationNum=3&fps=1
学习这个

下载:https://www.gutenberg.org/files/11/11-0.txt


这个例子很奇葩,学习的不是单词而是字符,所以预测效果肯定不好,写出来的东西都不一定
是英语

'''

# Load the text and convert it to lowercase.
# NOTE(review): the literal used to read r"C:Users张博Desktop111.txt"; the path
# separators were evidently lost, so the location below is a best-guess
# reconstruction -- confirm the real file name before running.
filename = r"C:/Users/张博/Desktop/111.txt"
# Read inside a `with` block so the handle is closed deterministically.
# Assumes the downloaded text is saved as UTF-8 -- TODO confirm; without an
# explicit encoding the platform default would be used.
with open(filename, encoding='utf-8') as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()





# Pre-processing: build the character vocabulary and both lookup tables.
chars = sorted(set(raw_text))
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = dict(enumerate(chars))

n_chars = len(raw_text)
n_vocab = len(chars)

print(n_chars)
print(n_vocab)
print(chars)

# Prepare the data set of input -> output pairs encoded as integers:
# every window of seq_length characters predicts the character that follows it.
seq_length = 100
dataX = []
dataY = []
for start in range(n_chars - seq_length):
    stop = start + seq_length
    seq_in = raw_text[start:stop]
    seq_out = raw_text[stop]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

import numpy
# Reshape X to [samples, time steps, features]; one feature per time step.
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# Targets become a column vector: one integer class id per sample.
y = numpy.reshape(dataY, (len(dataY), 1))
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM


import keras
from keras import regularizers

from keras import optimizers
print(y)
print(X)

# Single LSTM layer over (time steps, features) followed by a softmax classifier.
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))

# One output unit per distinct character.  The layer was previously sized with
# y.shape[1], which is 1 because y is a column of integer class ids: a softmax
# over a single unit cannot represent n_vocab classes, and
# sparse_categorical_crossentropy then looks up class ids that do not exist --
# this is what produced the loss=nan the author observed.
model.add(Dense(n_vocab, activation='softmax'))

# Gradient-norm clipping keeps the recurrent updates bounded.
adam = optimizers.Adam(lr=0.001, clipnorm=.5)

model.compile(loss='sparse_categorical_crossentropy', optimizer=adam)

# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
model.fit(X, y, epochs=2, batch_size=128, verbose=1)

# Author's note: training is very slow, roughly 5 minutes per epoch.


'''
这个工具很重要
在自然语言处理上，深度学习在机器翻译、自动问答、文本分类、情感分析、信息抽取、
序列标注、语法解析等领域都有广泛的应用。2013年末google发布的word2vec工具，
可以看做是深度学习在NLP领域的一个重要应用，虽然word2vec只有三层神经网络，但是
已经取得非常好的效果。通过word2vec，可以将一个词表示为词向量，将文字数字化，更好
的让计算机理解。使word2vec模型，
我们可以方便的找到同义词或联系紧密的词，或者意义相反的词等。
'''

#%%


'''
学习这篇
https://www.cnblogs.com/Newsteinwell/p/6034747.html
windows下无法用word2vec,所以放弃
'''
#linux 提取标签内容gerp即可.这个命令提取包含这个标签包含的行.
#cat news_tensite_xml.dat | iconv -f gbk -t utf-8 -c | grep "<content>"  > corpus.txt 
import codecs
# Read the Sogou news dump and keep only the lines carrying a <content> tag
# (the Python equivalent of the grep command shown above).
# The file is opened in a `with` block so the handle is closed after reading.
with open('C:/Users/张博/Desktop/news_sohusite_xml.smarty.dat', encoding='utf-8') as news_file:
    a = news_file.readlines()
out = [line for line in a if '<content>' in line]
print(out)
# Join the kept lines into one string.  Author's note: after the join the
# "ue..." characters no longer show up (presumably the \ue... escapes visible
# in the list repr -- TODO confirm).
a = ''.join(out)
print(a)




#%%
'''
学习这个:https://blog.csdn.net/qq_27492735/article/details/78596618
学习nltk框架
'''



import urllib.request

import nltk
from bs4 import BeautifulSoup

# Step 1 -- fetch the web page.  The page is downloaded once and reused by all
# later steps (the tutorial listing repeated the download for every step); the
# `with` block closes the HTTP response when the body has been read.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# Step 2 -- clean the page: strip the markup and keep only the visible text.
soup = BeautifulSoup(html, "html5lib")
text = soup.get_text(strip=True)

# Step 3 -- split the text on whitespace; each token is one word.
tokens = text.split()

# Step 4 -- count how often each token occurs and print the frequency table.
freq = nltk.FreqDist(tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))

# Step 5 -- plot the 20 most frequent tokens.
freq.plot(20, cumulative=False)
#%%
'''
处理停用词:所谓停用词就是那些虚词,这些东西是要剔除的.
'''
from nltk.corpus import stopwords

# Load the English stop-word list once and show it.
sr = stopwords.words('english')
a = sr
print(a)

# Drop every token that is a stop word.  Membership is tested against a set so
# each lookup is O(1) instead of a scan over the whole list.
stop_set = set(sr)
clean_tokens = [token for token in tokens if token not in stop_set]

# Word frequencies after the stop words have been removed.
freq = nltk.FreqDist(clean_tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))

freq.plot(20, cumulative=False)

#%%
'''
下面进行tokenize:

文本没有Tokenize之前是无法处理的，所以对文本进行Tokenize非常重要的。
token化过程意味着将大的部件分割为小部件。
你可以将段落tokenize成句子，将句子tokenize成单个词，NLTK分别提供了
句子tokenizer和单词tokenizer。
'''

# Sentence tokenizer: split a paragraph into sentences.
from nltk.tokenize import sent_tokenize

# The literal is written as adjacent strings inside parentheses; a plain
# double-quoted string cannot continue on the next source line.
mytext = ("Hello Adam, how are you? I hope everything is going well. Today "
          "is a good day, see you dude.")
print(sent_tokenize(mytext))


# Word tokenizer: split the text into individual words.
from nltk.tokenize import word_tokenize

mytext = ("Hello Mr. Adam, how are you? I hope everything is going well. "
          "Today is a good day, see you dude.")
print(word_tokenize(mytext))

#%%
# Try the sentence tokenizer on French text.
from nltk.tokenize import sent_tokenize

# Adjacent literals inside parentheses keep this one string; the source line
# used to break in the middle of the word "Aujourd'hui".
mytext = ("Bonjour M. Adam, comment allez-vous? J'espère que tout va bien. "
          "Aujourd'hui est un bon jour.")
print(sent_tokenize(mytext,"french"))


#%%
# If the corpus is missing, install it with nltk.download().
from nltk.corpus import wordnet

# Look up "pain": show the result type, all synsets, then the first synset's
# definition and usage examples.
syn = wordnet.synsets("pain")
print(type(syn))
print(syn)
first_sense = syn[0]
print(first_sense.definition())
print(first_sense.examples())

# Definition of the first synset for two more words.
for word in ("NLP", "Python"):
    syn = wordnet.synsets(word)
    print(syn[0].definition())

#%%

from nltk.corpus import wordnet

# Print all synonyms: every lemma name of every synset of "Computer".
synonyms = [
    lemma.name()
    for synset in wordnet.synsets('Computer')
    for lemma in synset.lemmas()
]
print(synonyms)

# Print all antonyms: the first antonym of each lemma of "small" that has one.
antonyms = []
for synset in wordnet.synsets("small"):
    for lemma in synset.lemmas():
        opposites = lemma.antonyms()
        if not opposites:
            continue
        antonyms.append(opposites[0].name())
print(antonyms)












#%%
# Stemming: reduce inflected words to their stem with the Porter algorithm.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
for inflected in ('working', 'worked'):
    print(stemmer.stem(inflected))












#%%变体还原:有时候词干提取是错的,需要变体还原
# Compare lemmatization with stemming on the same word.
from nltk.stem import PorterStemmer, WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# The original word.
print('原始单词')
print('increases')

# Result of lemmatization.
print('变体还原的结果')
print(lemmatizer.lemmatize('increases'))

# Result of stemming.
print('词干提取的结果')
print(stemmer.stem('increases'))


#%%
# Lemmatization with an explicit part of speech.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# First as a verb only ...
print(lemmatizer.lemmatize('playing', pos="v"))

# ... then as verb, noun, adjective and adverb in turn.
for pos_tag in ("v", "n", "a", "r"):
    print(lemmatizer.lemmatize('playing', pos=pos_tag))


'''
总结:
    
词干提取不会考虑语境，这也是为什么词干提取比变体还原快且准确度低的原因。

个人认为，变体还原比词干提取更好。单词变体还原返回一个真实的单词，即使它不是同一个
单词，也是同义词，但至少它是一个真实存在的单词。
'''

View Code

批量把文件名修改成为编号.py

#下面代码就是把路径path里面的内容都修改成他前面3个字符.因为这个例子里面正好前3个字符是编号




import os

# Rename every entry of `path` to the number formed by its first three
# characters (these file names start with a three-digit lesson number).
path='E:/计算机网络原理精讲视频教程/视频'
name=os.listdir(path)
for temp in name:
    # int() drops the leading zeros ("007..." -> "7"); it raises ValueError
    # when the first three characters are not a number.
    new_name=str(int(temp[:3]))
    # The call used to end in a dangling "+", which is a syntax error.
    os.rename(path+'/'+temp, path+'/'+new_name)

import os

# Second pass: prefix every entry of `path` with "课程" and append ".mp4".
path='E:/计算机网络原理精讲视频教程/视频'
name=os.listdir(path)
for temp in name:
    source = '%s/%s' % (path, temp)
    target = '%s/课程%s.mp4' % (path, temp)
    os.rename(source, target)
# Author's note: for the player to build a playlist automatically, the file
# name has to be a title followed by a number.

View Code

查看全文

相关阅读:
Cleaning Up Children Asynchronously
advacing lnux program 4.1.5 Thread Attributes[copy]
advacing lnux program Threads Return Value[copy]
批处理，所有子文件夹下面的所有文件
 .NET连接sybase乱码问题
 Asp.net forms认证遇到的一个奇怪的问题和测试过程
 WCF错误
 wcf超时错误
 gridControl控件显示交叉表
 K3 12.1修改报表Bug

原文地址：https://www.cnblogs.com/zhangbo2008/p/9163675.html