re、词云 - 走看看

zoukankan html css js c++ java

re、词云

正则：

re.S使点也能匹配到\n；re.I不区分规则中的大小写；re.X忽略空格及#后的注释；re.M把^和$由文首文末变为各行的首尾。

Egの删除各行行尾的alex，alex不区分大小写：

import re

# Sample text: three lines (separated by blank lines) that mention "alex"
# in mixed case; only two of them end with it.
s = '''ja654alEx

runAlex87

90helloaLeX'''

# MULTILINE lets `$` match at the end of every line; IGNORECASE accepts any
# casing of "alex". sub() without a count replaces every occurrence.
_trailing_alex = re.compile('alex$', flags=re.M | re.I)
m = _trailing_alex.sub('', s)

print(m)

******************分割线*******************

pattern中有正则字母，或者就是表示普通标点的正则标点，开头可不必加r。只有表示自身的\\，以及pattern中第num个()的\num如\2，才必须加r。

多单词匹配用…(单词a|单词b|单词c)…，即|外再套个()。

Egの提取姓名：

import re

# The outer group captures the full name, the inner group only the first name.
pattern = re.compile('((Bob|Jerry|Tom) Lee)')

s = 'Jerry Lee666；Tom Lee+-*/、Bob Lee HELLO'

# findall() yields one (full name, first name) tuple per hit; keep the full names.
r0 = [full_name for full_name, _first_name in pattern.findall(s)]

# Three equivalent ways to fetch only the first full name.
r1 = pattern.findall(s)[0][0]  # first hit, then its outer group
r2 = pattern.match(s).groups()[0]
r3 = pattern.match(s).group(1)

# One result per line (the separator had degraded to the default single space).
print(r0, r1, r2, r3, sep='\n')

******************分割线*******************

预搜索：

目标之后(?=666)为是而(?!666)为否，目标之前(?<=666)为是而(?<!666)为否。<和+*?等元字符冲突，故尽量避免使用前置预搜索。预搜索的()不会占用\num的名额。

\b为单词的首尾。Egの后面没紧跟￥的单词res：\bres\b(?!￥)

Egの给json的每个key，添加个前缀ids_：

import re

s = '{"name" :"jerry", "age":32}'

# Prefix every JSON key with "ids_".
# In a re.sub() replacement, \0 does not mean "the whole match", so the
# whole pattern is wrapped in a group and referenced as \1 instead.
# The patterns need their \w, \s and \1 escapes (raw strings), otherwise
# they only match the literal letters "w" and "s".
s1 = re.sub(r'(\w+?"\s*:)', r'ids_\1', s)

# Same result with lookarounds: the key must follow a quote and be followed
# by a quote, optional whitespace and a colon; only the key itself is consumed.
s2 = re.sub(r'(?<=")(\w+?)(?="\s*:)', r'ids_\1', s)

******************分割线*******************
()：

查找栏或规则中第3个()的内容，在替换栏对应的是：PyCharm为…$3…，VBA代码以及正则软件的替换表达式也是用$；而EditPlus却是…\3…，python代码里的正则也是用\，如re.sub(r'…','*\num*',text)。

()的序列始于1，顺序从外至内，从左至右。而PyCharm的$0或EditPlus的\0却是整个查找值或规则，与哪个()都无关。

re.sub(*)的pattern所匹配到的全文，在repl对应为lambda x:x[0]，或源自re.search(*).group(0)的完整写法lambda x:x.group(0)，或最简写法r'\0'(只不过会被识别为\x00而显示为□而不能用，好在可对pattern套个()从而使用\1来指代查找栏的全文)。repl在匹配出查找项后若想调用它，如作个替换或用作某个{}的key，\num就无能为力了。

Egの替换html的text里的所有a为ǎ，标签里的不动：

import re

html = '<div><a href="url">1a</a>1b 1c 1d 1a 1b 1d a1</div>'

# Approach 1: an "a" is part of the text when only non-bracket characters
# separate it from the next "<" (inside a tag the next bracket is ">").
_text_a = re.compile('a(?=[^<>]*?<)')
r1 = _text_a.sub('ǎ', html)


def _swap_a(match):
    """Return the matched '>text<' span with every 'a' replaced by 'ǎ'."""
    return match.group(0).replace('a', 'ǎ')


# Approach 2: grab each ">...<" span between tags and rewrite it in a callback.
r2 = re.sub('>.*?<', _swap_a, html)

Egの查找栏()匹配到的各项作为{}的key，对应的value，作为查找栏全文的替换值：

html = '&w1;&as2;&d3f;&zx4y;'

d = {'w1': 2, 'as2': '0', 'd3f': '1', 'zx4y': 8}

pattern = re.compile('&([a-z0-9]+?);')


def _entity_value(match):
    """Look up the captured entity name in ``d`` and return its value as text."""
    return str(d.get(match.group(1)))


# A callable is required here: a plain r'\1' replacement string cannot be
# used as a dictionary key, the captured text is only available on the match.
html = pattern.sub(_entity_value, html)

******************分割线*******************

(?:xy)+：使()内的xy只作为一整体而不蕴含\num意，re.findall的返回≥此()内匹配到的xy。

查找栏也有用\num+的时候，如匹配第num个()内的重复单词。

Egの连续重复的单词去重：

#在PyCharm的查找替换栏的应用：(word)\1+替换为$1

import re

s = 'wwwwert啦啦+-*/666嘿PythonPython'

# Runs of a consecutively repeated unit. The inner group is group 2, so the
# backreference is \2+; findall() returns (whole run, unit) tuples.
提取连续出现的词 = [run for run, _unit in re.findall(r'((.+?)\2+)', s)]

# The repeated unit itself, once per run: a single group, referenced as \1+.
提取去重之后的词 = re.findall(r'(.+?)\1+', s)

****************************************分割线****************************************

纵向打印古诗：

import re,itertools

poetry = '鹅鹅鹅，曲项向天歌。。白毛浮绿水，，，红掌拨清波~~！！'

# Strip the trailing non-Chinese characters first, then split on every run
# of non-Chinese characters so that only the verses remain.
_trimmed = re.sub('[^一-龥]+$', '', poetry)
t = re.split('[^一-龥]+', _trimmed)

t.reverse()

print(t)

# zip_longest() takes the i-th character of every verse to build one printed
# row; verses that are too short contribute an empty string.
for column in itertools.zip_longest(*t, fillvalue=''):
    print('   '.join(column))

# Alternative that prints character by character:
#[print(y,end='   ') if y!=x[-1] else print(y) for x in itertools.zip_longest(*t,fillvalue='') for y in x]

******************分割线*******************

①zip函数以父序列的各元素作函参序列：*父序列名；

②双层for循环：外层for在前，内层for在后

③列表解析，if写在for前得补个else 0之类的，在for后则不必；

执行甲 if True else 执行乙：True and 执行甲 or 执行乙

******************分割线*******************

乘法口诀：

1*1=1

1*2=2 2*2=4

………………

def multiplication_table(limit=9):
    """Return the multiplication table up to *limit* as text.

    Row ``y`` holds ``1*y`` .. ``y*y`` separated by single spaces; rows are
    separated by newlines (the row terminator had degraded to a space, which
    put the whole table on one line).
    """
    # Outer loop over y comes first, inner loop over x second.
    return '\n'.join(
        ' '.join(f'{x}*{y}={x*y}' for x in range(1, y + 1))
        for y in range(1, limit + 1)
    )


print(multiplication_table())

# One-line variants that print as a side effect of a list comprehension:
#[print(f'{x}*{y}={x*y}',end=(' ' if x<y else '\n')) for y in range(1,10) for x in range(1,10) if x<=y]
#[print('%s*%s=%s' %(x,y,x*y),end=(' ' if x<y else '\n')) for y in range(1,10) for x in range(1,10) if x<=y]

****************************************分割线****************************************

多个词的替换或提取——flashtext：

若2+个中文关键词，或中文与单词在正文携手出现，无空格等分隔，后者会被无视。

from flashtext import KeywordProcessor

kp = KeywordProcessor()  # the case-sensitivity parameter defaults to False


def Egの大杂烩():
    """Demonstrate several flashtext replacement features on one sample string.

    Uses the shared module-level ``kp`` processor and prints the replaced text.
    """
    # Start from a clean slate: drop every keyword registered so far.
    kp.remove_keywords_from_list(list(kp.get_all_keywords().keys()))

    # '、' joins with the characters/words on both sides into one new word.
    kp.add_non_word_boundary('、')

    olds = 'abcd eFg higk lmn opQ rst'.split()
    # When the replacement is '' or None, or there is no counterpart,
    # the old word is replaced by itself.
    news = f'子丑寅卯辰巳午未 {""} 申酉戌亥'.split(' ')

    # Many-to-many replacement: one new word per old word.
    for old, new in zip(olds, news):
        kp.add_keyword(old, new)

    # Many-to-one replacement: every listed word maps to the dict key.
    kp.add_keywords_from_dict(dict(秦=['甲','乙','丙丁'],唐宋=['6']))

    replace = kp.replace_keywords('乙甲乙hello,EFG OPQ rSt 6')  # or .extract_keywords
    print(replace)


Egの大杂烩()

*******分割线*******

def Egの去除文件中的若干关键词(path):
    """Remove a fixed set of keywords from the file at *path*, in place.

    Uses the shared module-level ``kp`` processor. The file is read and
    rewritten with the platform default encoding.
    """
    # Start from a clean slate: drop every keyword registered so far.
    kp.remove_keywords_from_list(list(kp.get_all_keywords().keys()))

    # Map every unwanted keyword to the placeholder '乄乄' ...
    kp.add_keywords_from_dict({'乄乄':['的','了','是','有','在','不','子','个','世界']})

    with open(path) as f:
        # ... and then delete the placeholder from the replaced text.
        result = kp.replace_keywords(f.read()).replace('乄乄','')

    with open(path,'w') as f:
        f.write(result)


Egの去除文件中的若干关键词('E:/龙符.txt')

****************************************分割线****************************************

汉字→拼音：

from xpinyin import Pinyin

# Convert Chinese text to pinyin and to pinyin initials with xpinyin.
py=Pinyin()

s='朝辞白帝彩云间，千里江陵一日还。两岸猿声啼不住，轻舟已过万重山。'

拼音=py.get_pinyin(s,' ',True,'upper') # defaults for args 2-4: separator '-', no tone marks, lower case

首字母=py.get_initials('你好啊','|').lower() # defaults: separator '-', upper case

******************分割线*******************

分析归类の诗词的作者是李白还是杜甫：

python -m pip install textblob -i https://pypi.douban.com/simple/

python -m textblob.download_corpora -i https://pypi.douban.com/simple/

import jieba,os

from textblob.classifiers import NaiveBayesClassifier

def handleJieba(string):
    """Segment *string* with jieba and drop common punctuation tokens.

    Returns the list of remaining tokens in their original order.
    """
    # Tokens to discard: the two common punctuation marks, a space and the
    # empty string.
    noise = {'，', '。', ' ', ''}
    # One filtering pass instead of repeated list.remove() scans.
    return [word for word in jieba.cut(string, True) if word not in noise]

def materialTrain(comparedFiles=()):
    """Train a naive Bayes classifier on the given poem collections.

    Each path in *comparedFiles* is a UTF-8 text file; the file name without
    its extension is used as the class label for every word segmented from
    that file. The trained classifier is then passed to makeDecisions().
    """
    files = {}   # label -> hit counter, filled in by makeDecisions()
    train = []   # (word, label) training samples

    for txt in comparedFiles:
        name = os.path.basename(txt).split('.')[0]
        files[name] = 0
        with open(txt, encoding='utf8') as f:
            result = handleJieba(f.read())
        train.extend((word, name) for word in result)

    classifier = NaiveBayesClassifier(train)
    makeDecisions(files, classifier)

def makeDecisions(files, classifier):
    """Ask for one verse and print, per label, the share of its words
    that the classifier attributes to that label.

    *files* maps each label to a counter and is updated in place.
    """
    words = handleJieba(input('请输入一句诗词：'))

    for word in words:
        classifyResult = classifier.classify(word)
        if classifyResult in files:
            files[classifyResult] += 1

    total = len(words)
    for name in files:
        # An empty input has no words; report 0% instead of dividing by zero.
        share = files[name] / total * 100 if total else 0.0
        print(f'{name}的概率：{share:0.2f}%')

# Poem collections to compare; each file name (without extension) becomes a label.
comparedFiles=['E:/李白.txt','E:/杜甫.txt']

# Train on the collections, then prompt for a verse and print the result.
materialTrain(comparedFiles)

******************分割线*******************

短文本分类工具：目前只支持Python2

from tgrocery import Grocery

# Train, save, reload and query a short-text classifier with tgrocery.
gc=Grocery('短文本分类工具')

train=[('education', '名师指导托福语法技巧：名词的复数形式'),

    ('education', '中国高考成绩海外认可是狼来了吗？'),

    ('sports', '图文：法网孟菲尔斯苦战进16强孟菲尔斯怒吼'),

    ('sports', '四川成都举行全国长距登山挑战赛近万人参与'),]

gc.train(train) # list form: each item is a (label, text) tuple; the 2nd arg `delimiter` defaults to tab and is used with a file path

#gc.train('E:/train.txt')    # file-path form: one line (label + tab + text) is one training sample

gc.save()   # creates a local folder named after the first constructor argument

gc=Grocery('短文本分类工具')   # load the model again, then predict or test

gc.load()

问答=gc.predict('考生必读：新托福写作考试评分标准')

test=[('education', '福建春招考试报名18日截止 2月6日考试'),

    ('sports', '意甲首轮补赛交战记录:国米10年连胜'),]

判断=gc.test(test)

****************************************分割线****************************************

结巴分词&词云：

结巴分词的俩鸡肋方法.cut(s)、.tokenize(s)：

import jieba

s = 'qw ert qwe rt'

# Plain segmentation result, capped at the first 50 tokens.
无意义的原始分词结果 = list(jieba.cut(s))[:50]

# tokenize() yields tuples whose first item is the word; the name says the
# indices are start-inclusive / end-exclusive.
各单词起闭止开的索引 = list(jieba.tokenize(s))

# Print the first token containing 'er', i.e. where it first occurs.
for word in 各单词起闭止开的索引:
    if 'er' in word[0]:
        print(word)
        break

******************分割线*******************

jieba.analyse：

.extract_tags(*)：jieba.cut(s)后剔除无意义词并汇总，再顺序取topN。

首参str，在set_stop_words是追加的自定义踢词的文件路径，在extract_tags是待分析的正文。

Egの为1本小说制作词云：

from jieba.analyse import set_stop_words,extract_tags

from wordcloud import WordCloud,STOPWORDS as sw

import numpy as np

from PIL import Image

# 1) Extract the high-frequency Chinese words with jieba:

stopWords='D:/中文停用词表.txt'   # one stop word per line

txtFile='F:/New Download/example.txt'

with open(txtFile) as f:

    sentence=f.read()

set_stop_words(stopWords)   # jieba-side filtering: custom Chinese stop words

#words=extract_tags(sentence,50)

#text=' '.join(words)

# Most word-cloud websites only need the words in ranked order, whereas
# wc.generate_from_frequencies({...}) and the like also need frequencies.

words=extract_tags(sentence,topK=50,withWeight=True)

# Scale each weight by 1000 and truncate to an integer frequency.
frequencies={word[0]:int(word[1]*1000) for word in words}

# 2) Feed the {word: frequency} data into the word cloud:

backImg='F:/New Download/background.png'

bg=np.array(Image.open(backImg))    #bg=scipy.misc.imread(backImg)

wc=WordCloud('simhei.ttf',mask=bg,max_font_size=81) # the mask is an ndarray

#wc.stopwords=sw|set(open(stopWords).readlines())   # word-cloud-side filtering: built-in English plus custom Chinese

# wc.generate_from_frequencies({word1: freq1, ...}) or wc.generate('space separated words')

wc.generate_from_frequencies(frequencies)   #wc.generate(text)

# 3) Show the picture: option 1 uses PIL's Image with the saved file path,
#    option 2 uses matplotlib with the WordCloud object.

saveImg='F:/New Download/result.jpg'

wc.to_file(saveImg)

Image.open(saveImg).show()

#import matplotlib.pyplot as plt

#plt.imshow(wc)

#plt.axis('off')

#plt.savefig(saveImg,dpi=240,bbox_inches='tight')

#plt.show()

******************分割线*******************

词云网站https://wor删dart.com/create(加载完要等几秒)的用法：

左侧的：WORDSのImport→保持顺序贴入各词(勾上俩Remove，若有词频且以分割则勾上CSV)→SHAPES选个图→FONTSのAdd font(如选个本机的雅黑字体，网站提供的那些都不支持中文)→LAYOUT设字体倾斜→STYLEのCustom设字体五颜六色

→右上的Visualize

→右顶的DOWNLOAD(chrome设为内置下载)。

****************************************分割线****************************************

把两张图(尺寸和模式要相同)，合成为一张新图：

from PIL import Image

# Merge two pictures into one; both are converted to RGBA and the second is
# resized to the size of the first so that size and mode agree.
backGround=Image.open('F:/666.png').convert('RGBA')

img=Image.open('F:/1.jpg').resize(backGround.size).convert('RGBA')

# backGround.paste(img,(0,40)) # combine keeping each picture's own size

# backGround.show() #.save('F:/result.png')

# result=Image.blend(img,backGround,0.2) # combine by transparency

result=Image.alpha_composite(img,backGround) # the background is a transparent png

result.show()

******************分割线*******************

Egの把许多图均匀整合到一张正方形内：

import glob,random,math

from PIL import Image

def mixImages(totalSize=640):
    """Tile every ``*.jpg`` of the global ``imagesFolderPath`` into one square.

    The pictures are shuffled, resized and pasted row by row onto a
    ``totalSize`` x ``totalSize`` canvas; the last row is stretched so that it
    also spans the full width. The result is shown and saved as a JPEG.

    Raises:
        FileNotFoundError: if the folder contains no ``*.jpg`` file.
    """
    import os  # local import: this snippet's import line only brings in glob, random and math

    # os.path.join supplies the separator the plain string concatenation lacked.
    images = glob.glob(os.path.join(imagesFolderPath, '*.jpg'))
    totalNum = len(images)
    if not totalNum:
        # The grid arithmetic below would otherwise divide by zero.
        raise FileNotFoundError(f'no *.jpg images found in {imagesFolderPath!r}')

    vnum = math.ceil(math.sqrt(totalNum))   # number of rows: ceiling of the square root
    hnum1 = math.ceil(totalNum / vnum)      # pictures per row, except the last row
    frontNum = hnum1 * (vnum - 1)           # pictures in all rows but the last

    vsize = int(totalSize / vnum)
    hsize1 = int(totalSize / hnum1)
    hsize2 = int(totalSize / (totalNum - frontNum))  # wider tiles for the last row

    # Both 4-channel RGBA pngs and 3-channel RGB bmp/jpg pictures can be
    # pasted onto an RGBA canvas.
    toImage = Image.new('RGBA', (totalSize, totalSize))

    random.shuffle(images)
    for index, name in enumerate(images):
        y, x = divmod(index, hnum1)  # canvas cursor: row, column
        tileWidth = hsize1 if index < frontNum else hsize2
        with Image.open(name) as source:
            img = source.resize((tileWidth, vsize))
        toImage.paste(img, (tileWidth * x, vsize * y))

    toImage.show()

    # JPEG has no alpha channel: keep only the R, G and B bands.
    r, g, b = toImage.split()[:3]
    toImage = Image.merge('RGB', (r, g, b))
    toImage.save('D:/合成图.jpg')

if __name__ == '__main__':
    # Folder holding the pictures to combine; mixImages() reads this global.
    # The trailing separator keeps the '*.jpg' pattern inside the folder.
    imagesFolderPath = 'D:/待整合的各图/'
    mixImages(totalSize=720)

******************分割线*******************

Egの分别把各图填充为正方形并均匀切为9块：

from PIL import Image

import os,pathlib

def fill(originalImage):
    """Paste *originalImage* centred on a white square canvas.

    The square's side is the longer side of the original. The new image is
    saved to the global ``newFilePath`` and returned.
    """
    width, height = originalImage.size
    newLength = max(width, height)
    newImage = Image.new(originalImage.mode, (newLength, newLength), color='white')
    # The offset along the longer side is 0, so one expression covers both
    # the portrait and the landscape case.
    leftup = ((newLength - width) // 2, (newLength - height) // 2)
    newImage.paste(originalImage, leftup)
    newImage.save(newFilePath)
    return newImage

def cut(newImage):
    """Cut the square *newImage* into a 3x3 grid and return the 9 pieces.

    Pieces are ordered row by row, left to right. The piece side is the
    width divided by 3, rounded down, so up to 2 trailing pixels are dropped.
    """
    pieceLenth = newImage.size[0] // 3
    return [
        newImage.crop((x * pieceLenth, y * pieceLenth,
                       (x + 1) * pieceLenth, (y + 1) * pieceLenth))
        for y in range(3)
        for x in range(3)
    ]

def save(pieces):
    """Save the pieces cut from the square image.

    Each file name is the global ``newFilePath`` with its first '_new'
    replaced by '_<index>'.
    """
    for index, piece in enumerate(pieces):
        piece.save(newFilePath.replace('_new', '_' + str(index), 1))

def walk(folderPath):
    """Pad and cut every ``*.jp*`` picture found under *folderPath*.

    For each picture a folder named after the file (without extension) is
    created next to it; the padded square and its 9 pieces are written there.
    Sets the global ``newFilePath`` read by fill() and save(), and changes
    the working directory.
    """
    global newFilePath

    # Absolute paths: the loop changes the working directory, which would
    # invalidate relative paths from the second picture onwards.
    filesPath = [os.path.abspath(str(path))
                 for path in pathlib.Path(folderPath).rglob('*.jp*')]

    for filePath in filesPath:
        newFolder, extension = os.path.splitext(filePath)
        newFilePath = os.path.basename(newFolder) + '_new' + extension
        os.makedirs(newFolder, exist_ok=True)
        # fill() and save() write to relative names, so work inside the folder.
        os.chdir(newFolder)
        with Image.open(filePath) as originalImage:
            newImage = fill(originalImage)
        pieces = cut(newImage)
        save(pieces)

if __name__ == '__main__':
    # Root folder that is searched recursively for pictures.
    folderPath = 'D:/图片'
    walk(folderPath)

******************分割线*******************

制作验证码：

import random,string

from PIL import Image,ImageDraw,ImageFont,ImageFilter

# Each captcha "character" is itself made of several random characters.
def rand_font(couple):
    """Return a random string of *couple* characters.

    Each character is a digit with probability 1/3 and an ASCII letter
    (upper or lower case) otherwise.
    """
    chars = []
    for _ in range(couple):
        # 2 selects a digit; 1 and 3 select a letter.
        if random.randint(1, 3) == 2:
            chars.append(str(random.randint(0, 9)))
        else:
            chars.append(random.choice(string.ascii_letters))
    return ''.join(chars)

# Colour of each captcha character.
def rand_fontColor():
    """Return a random (r, g, b) tuple with every channel in 64..255."""
    return tuple(random.randint(64, 255) for _ in range(3))

# Colour of each background pixel.
def rand_drawPixelColor():
    """Return a random (r, g, b) tuple with every channel in 32..127."""
    return tuple(random.randint(32, 127) for _ in range(3))

# Width and height of the background picture.
width = 60 * 4
height = 60

img = Image.new('RGB', (width, height), (0, 0, 0))  # background: mode, size, colour
draw = ImageDraw.Draw(img)  # drawing object
font = ImageFont.truetype('C:/Windows/Fonts/Arial.ttf', 36)  # font object

# Give every background pixel its own random colour.
for i in range(width):
    for j in range(height):
        draw.point((i, j), rand_drawPixelColor())

# Write 4 captcha "characters", each made of 2 random characters.
for i in range(4):
    draw.text((60 * i + 10, 10), text=rand_font(2), fill=rand_fontColor(), font=font)

# Blur the picture to make recognition harder.
img = img.filter(ImageFilter.BLUR)
img.show()

******************分割线*******************

给图片配文字：

from PIL import Image,ImageDraw,ImageFont

# Draw a red caption onto a picture and save the result as a new JPEG.
customFont=ImageFont.truetype('F:/msyh.ttc',50)

image=Image.open('F:/原图.jpg')

width,height=image.size

draw=ImageDraw.Draw(image) # drawing object bound to the picture

draw.text((width*1/3,height/2),'陈独秀你坐下！！','#ff0000',customFont) # text starts a third of the way across, at mid height

image.save('F:/新图.jpg','jpeg')

******************分割线*******************

识别图片中的文字：

tesseract-ocr.exe安装到默认路径，勾选Additional language下的Chinese(simplified)

pytesseract.py中改tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract.exe'

爬到的图片b节码不存图而直接打开：Image.open(io.BytesIO(response.content)).show()

from PIL import Image

from pytesseract import image_to_string

# Tell tesseract where its language data lives.
config = "--tessdata-dir 'C:/Program Files (x86)/Tesseract-OCR/tessdata'"

with Image.open('F:/New Download/1.jpg') as img:
    # Recognise simplified Chinese and remove the spaces from the result.
    text = image_to_string(img, 'chi_sim', config=config).replace(' ', '')

print(text)

******************分割线*******************

素描：

from PIL import Image

import numpy as np

# Turn a photo into a pencil-sketch style picture by shading the grey-level
# gradient with a virtual light source.
a = np.asarray(Image.open('D:/原图.jpg').convert('L')).astype('float')

depth = 10.  # (0-100)

grad = np.gradient(a)  # gradient of the grey levels
# NOTE(review): np.gradient returns the axis-0 component first; the x/y
# names below follow the original snippet.
grad_x, grad_y = grad

grad_x = grad_x * depth / 100.
grad_y = grad_y * depth / 100.

# Normalise (grad_x, grad_y, 1) to a unit vector.
A = np.sqrt(grad_x ** 2 + grad_y ** 2 + 1.)
uni_x = grad_x / A
uni_y = grad_y / A
uni_z = 1. / A

vec_el = np.pi / 2.2  # elevation angle of the light source, in radians
vec_az = np.pi / 4.   # azimuth angle of the light source, in radians

dx = np.cos(vec_el) * np.cos(vec_az)  # light contribution along x
dy = np.cos(vec_el) * np.sin(vec_az)  # light contribution along y
dz = np.sin(vec_el)                   # light contribution along z

# Dot product of light direction and unit vector, scaled to 0..255.
b = 255 * (dx * uni_x + dy * uni_y + dz * uni_z)
b = b.clip(0, 255)

im = Image.fromarray(b.astype('uint8'))  # rebuild the picture
im.save('D:/素描.jpg')

******************分割线*******************

雪花飘飘：

import pygame,random

pygame.init()

size = (1364, 569)  # window size, same as the background picture
screen = pygame.display.set_mode(size)
bg = pygame.image.load('F:/New Download/snow.jpg')
pygame.display.set_caption('Snow Animation')

# Each snowflake is [x, y, x speed, y speed].
snows = []
for i in range(200):
    x = random.randrange(0, size[0])
    y = random.randrange(0, size[1])
    sx = random.randint(-2, 2)  # .randint(3,6)=.choice(range(3,7,1))=.randrange(3,7,1)
    sy = random.randint(4, 7)
    snows.append([x, y, sx, sy])

clock = pygame.time.Clock()
num = 0  # frame counter
done = False

while not done:
    screen.blit(bg, (0, 0))  # picture background; for black use screen.fill((0,0,0))

    for snow in snows:
        # Draw the flake: colour, position, radius (derived from its y speed).
        pygame.draw.circle(screen, (255, 255, 255), snow[:2], snow[3] - 3)

        # Move the flake; takes effect on the next frame.
        snow[0] += snow[2]
        snow[1] += snow[3]

        # A flake that fell below the window restarts above the top edge.
        if snow[1] > size[1]:
            snow[1] = random.randrange(-50, -10)
            snow[0] = random.randrange(0, size[0])

    pygame.display.flip()  # refresh the screen
    clock.tick(20)

    # Save the first four frames as pictures.
    num += 1
    if num < 5:
        pygame.image.save(screen, f'F:/New Download/snow-{num}.jpg')

    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            done = True

pygame.quit()

查看全文

相关阅读:
Python的四种常见数据结构比较
 LeetCode Notes_#53 Maximum Subarray
LeetCode Notes_#38 Count and Say
LeetCode Notes_#6 Zigzag Conversion
LeetCode Notes_#5 Longest Palindromic Substring
《美国纽约摄影学院摄影教材》
《艺术的故事》
《Don't make me think》
《Geospatial Data Science Techniques and Applications》
《程序员的自我修养：链接、装载与库(完整版).pdf》

原文地址：https://www.cnblogs.com/scrooge/p/7693541.html