Part 1: Preparing the data
A machine-learning algorithm can't perform well without data, so the first step is to pull some test data from the web. Starting with English: I scraped English novels from a fiction site and cleaned the sentences down to what we need:
import re

# en_data.txt holds the novel text
with open('en_data.txt', 'r', encoding='utf8') as f:
    data = f.read().split('\n')
# print(data)

lists = []
# p = re.compile(r"[\w]+[。,?!]")
p = re.compile(r"[\w -]+")  # runs of word characters, spaces and hyphens
for i in data:
    l = p.findall(i)
    lists.append(l)
# print(lists)

with open('data.csv', 'w', encoding='utf8') as f:
    for i in lists:
        if i:
            for b in i:
                if len(b) > 20:  # keep only reasonably long fragments
                    e = str(b).strip()
                    e = e.strip(',.')
                    d = e + ',en\n'
                    f.write(d)
                    print(d)
Then pull the Chinese data. I collected a large number of Chinese articles from a news site and processed them the same way:
import re

# ch_data.txt holds the news articles
with open('ch_data.txt', 'r', encoding='utf8') as f:
    data = f.read().split('\n')
# print(data)

lists = []
p = re.compile(r"[\w]+[。,?!]")  # word runs ending in CJK punctuation
# p = re.compile(r"[\w -]+")
for i in data:
    l = p.findall(i)
    lists.append(l)
# print(lists)

with open('data.csv', 'a', encoding='utf8') as f:  # append below the English rows
    for i in lists:
        if i:
            for b in i:
                if len(b) > 10:
                    e = str(b).strip()
                    e = e.strip(',。')
                    d = e + ',ch\n'
                    f.write(d)
                    print(d)
The processing above produces data.csv. It is best to keep the amount of Chinese data roughly equal to the amount of English data, so the classifier does not learn a skewed class prior.
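Each line of data.csv is the text followed by a ',en' or ',ch' label; the loading code further down slices the last two characters off each line, so the format matters. The sample rows and the balance check below are my own sketch, not part of the original pipeline. Rows might look like:

It was a bright cold day in April,en
今天的天气非常好,ch

And a minimal sanity check on class balance:

from collections import Counter

# Quick sanity check: count 'en' vs 'ch' labels in data.csv
with open('data.csv', 'r', encoding='utf8') as f:
    labels = [line.strip()[-2:] for line in f if line.strip()]
print(Counter(labels))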
Next, create a language-identification class:
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


class LanguageDetector():
    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        # Extract useful features from the denoised data:
        # 1-gram and 2-gram count statistics
        self.vectorizer = CountVectorizer(ngram_range=(1, 2),
                                          max_features=1000,
                                          preprocessor=self._remove_noise)
        """
        Full parameter reference:
        vec = CountVectorizer(
            lowercase=True,       # lowercase the text
            analyzer='char_wb',   # tokenise by character ngrams
            ngram_range=(1, 2),   # use ngrams of size 1 and 2
            max_features=1000,    # keep the most common 1000 ngrams
            preprocessor=remove_noise
        )
        """

    def _remove_noise(self, document):
        # Strip URLs, @mentions and #hashtags before vectorising
        noise_pattern = re.compile("|".join([r"http\S+", r"@\w+", r"#\w+"]))
        clean_text = re.sub(noise_pattern, "", document)
        return clean_text

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):  # train
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):  # predict the language of one string
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):  # mean accuracy on held-out data
        return self.classifier.score(self.features(X), y)
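To see what the noise filter removes, here is a quick check. The input string is made up for illustration, and _remove_noise is a private helper, called directly here only for demonstration:

detector = LanguageDetector()
print(detector._remove_noise(
    'RT @someone Check this out http://example.com #nlp'))
# -> 'RT  Check this out  '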
Now put it to the test:
in_f = open('data.csv', 'r', encoding='utf8')
lines = in_f.readlines()
in_f.close()
# Read each line as a (text, label) tuple: the trailing two
# characters are the language label, the rest is the sentence
dataset = [(line.strip()[:-3], line.strip()[-2:]) for line in lines]
# print(dataset)

# Split the dataset into a training set and a test set
x, y = zip(*dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict(
    "Life is a journey. What we should care about is not where it's headed "
    "but what we see and how we feel."))
print(language_detector.score(x_test, y_test))
# ['en']               -> predicted English
# 0.9849048348282652   -> accuracy on the test set
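Accuracy alone can hide per-class differences. A minimal sketch of a fuller evaluation with scikit-learn's classification_report, reusing x_test and y_test from above (this is my addition, not part of the original):

from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out test set
y_pred = [language_detector.predict(text)[0] for text in x_test]
print(classification_report(y_test, y_pred))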
The complete code:

# -*- coding: utf-8 -*-
# @Author : FELIX
# @Date   : 2018/3/28 11:05
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


class LanguageDetector():
    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        # Extract useful features from the denoised data:
        # 1-gram and 2-gram count statistics
        self.vectorizer = CountVectorizer(ngram_range=(1, 2),
                                          max_features=1000,
                                          preprocessor=self._remove_noise)
        """
        Full parameter reference:
        vec = CountVectorizer(
            lowercase=True,       # lowercase the text
            analyzer='char_wb',   # tokenise by character ngrams
            ngram_range=(1, 2),   # use ngrams of size 1 and 2
            max_features=1000,    # keep the most common 1000 ngrams
            preprocessor=remove_noise
        )
        """

    def _remove_noise(self, document):
        # Strip URLs, @mentions and #hashtags before vectorising
        noise_pattern = re.compile("|".join([r"http\S+", r"@\w+", r"#\w+"]))
        clean_text = re.sub(noise_pattern, "", document)
        return clean_text

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):  # train
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):  # predict the language of one string
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):  # mean accuracy on held-out data
        return self.classifier.score(self.features(X), y)


in_f = open('data.csv', 'r', encoding='utf8')
lines = in_f.readlines()
in_f.close()
# Read each line as a (text, label) tuple
dataset = [(line.strip()[:-3], line.strip()[-2:]) for line in lines]
# print(dataset)

# Split the dataset into a training set and a test set
x, y = zip(*dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict(
    "Life is a journey. What we should care about is not where it's headed "
    "but what we see and how we feel."))
print(language_detector.score(x_test, y_test))
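The docstring in the class hints at analyzer='char_wb', i.e. character n-grams, which often work better for language identification than the default word tokens because they generalise to unseen words. A sketch of that variation, reusing the imports and the x_train/x_test split from the complete code above (a modification of mine, not the original configuration):

detector = LanguageDetector()
# Swap in a character-level vectorizer, as suggested in the docstring
detector.vectorizer = CountVectorizer(
    lowercase=True,
    analyzer='char_wb',    # n-grams of characters inside word boundaries
    ngram_range=(1, 2),
    max_features=1000,
    preprocessor=detector._remove_noise,
)
detector.fit(x_train, y_train)
print(detector.score(x_test, y_test))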