Wikipedia Processing
For Chinese, https://dumps.wikimedia.org/zhwiki/latest/
zhwiki-latest-pages-articles.xml.bz2
For English, https://dumps.wikimedia.org/enwiki/latest/
enwiki-latest-pages-articles.xml.bz2
Chinese
Process the dump in the following order:
- Extraction
- Convert Traditional Chinese to Simplified Chinese
- Keep only valid UTF-8 characters
- Keep only Chinese characters
- Segmentation
Extraction
Extract plain text from zhwiki-20200101-pages-articles.xml.bz2 with the following code.
# encoding:utf8
import sys
from gensim.corpora import WikiCorpus
from tqdm import tqdm
def write_articles(articles, fo):
    """Write tokenized articles to ``fo``, one article per line.

    Args:
        articles: Iterable of articles; each article is an iterable of tokens.
        fo: Writable text stream that receives the output.

    The tokens of an article are written back-to-back with no separator,
    and every article (including an empty one) is terminated by a newline.
    """
    for article in articles:
        # One write per article instead of one per token.
        fo.write("".join("%s" % token for token in article) + "\n")


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('Usage: python3 wikipedia_extraction.py wikipedia.xml.bz2 wikipedia.txt')
        # Stop here; otherwise the argument indexing below raises IndexError.
        sys.exit(1)
    file_name = sys.argv[1:]
    # The context manager guarantees the output is flushed and closed,
    # even if extraction fails part-way through the dump.
    with open(file_name[1], encoding='utf8', mode='w') as fo:
        # NOTE(review): newer gensim releases may reject the `lemmatize`
        # argument -- confirm against the installed gensim version.
        # NOTE(review): the empty dict presumably lets gensim skip building
        # a vocabulary from the dump -- confirm.
        wiki = WikiCorpus(fname=file_name[0], lemmatize=False, dictionary=dict())
        write_articles(tqdm(wiki.get_texts()), fo)
Converting
Convert Traditional Chinese to Simplified Chinese with the following bash command.
opencc -i wikipedia.zh.txt -o wikipedia.zhs.txt -c t2s.json
t2s.json can be obtained from https://github.com/BYVoid/OpenCC/blob/master/data/config/t2s.json ; its contents are shown below.
{
"name": "Traditional Chinese to Simplified Chinese",
"segmentation": {
"type": "mmseg",
"dict": {
"type": "ocd",
"file": "TSPhrases.ocd"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "ocd",
"file": "TSPhrases.ocd"
}, {
"type": "ocd",
"file": "TSCharacters.ocd"
}]
}
}]
}
Keep utf-8
Use the following bash command to keep only valid UTF-8 characters (invalid byte sequences are dropped).
iconv -c -t UTF-8 -o wikipedia.zhs.utf8.txt wikipedia.zhs.txt
Keep Chinese
Keep only Chinese characters (plus spaces and newlines) in the corpus with the following code.
# encoding:utf8
# Filter out non-Chinese characters
import sys
from tqdm import tqdm
def keep_chinese(line):
    """Return ``line`` reduced to Chinese characters, spaces and newlines.

    Args:
        line: Text to filter.

    Returns:
        A string containing, in their original order, only the characters
        of ``line`` that are a space, a newline, or fall in the
        U+4E00..U+9FA5 range that this script treats as Chinese.
    """
    return "".join(
        char for char in line
        # The bounds must be the escaped code points '\u4e00' / '\u9fa5';
        # the plain strings 'u4e00' / 'u9fa5' never match a single character.
        if char == ' ' or char == '\n' or u'\u4e00' <= char <= u'\u9fa5'
    )


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: python3 wikipedia.zhs.utf8.txt wikipedia.zhs.utf8.chi.txt")
        sys.exit(1)
    # Both files are managed by the `with` so the output is always closed.
    with open(sys.argv[1], encoding='utf8') as fin, \
            open(sys.argv[2], encoding='utf8', mode='w') as fout:
        for line in tqdm(fin):
            # One write per line instead of one per character.
            fout.write(keep_chinese(line))
Segmentation
Segment the corpus with the following code. This is a simple segmentation program.
# encoding:utf8
# Just a simple segmentation program
import sys
import jieba
from tqdm import tqdm
def sentences(fpath):
    """Lazily yield the lines of a UTF-8 text file, stripped of whitespace.

    Args:
        fpath: Path of the file to read.

    Yields:
        Each line of the file with leading and trailing whitespace
        (including the line terminator) removed. Blank lines are yielded
        as empty strings.
    """
    with open(fpath, encoding='utf8') as f:
        for line in f:
            yield line.strip()
def join_words(words):
    """Join segmented words with single spaces, dropping bare-space tokens.

    Args:
        words: Iterable of word strings, e.g. the output of ``jieba.cut``.

    Returns:
        The words separated by one space each, with every token that is
        exactly ``' '`` removed first.
    """
    # Single filtering pass; repeated list.remove() rescans the list each time.
    return " ".join(word for word in words if word != ' ')


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: python3 SimSeg.py in-path out-path")
        sys.exit(1)
    jieba.initialize()
    # The context manager guarantees the output file is flushed and closed.
    with open(sys.argv[2], encoding='utf8', mode='w') as f:
        for sentence in tqdm(sentences(sys.argv[1])):
            words = jieba.cut(sentence, cut_all=False)
            f.write("%s\n" % join_words(words))