jieba库常用函数
文本字符统计实例
一、统计《哈姆雷特》中出现次数最多的前10个词
# Normalize the text: lowercase it and blank out punctuation.
def getText(path='hamlet.txt'):
    """Read a text file and return its contents normalized for word counting.

    The text is lowercased and every punctuation character listed below is
    replaced by a single space, so that ``str.split()`` on the result yields
    clean word tokens.

    Args:
        path: File to read. Defaults to ``'hamlet.txt'`` in the current
            working directory.

    Returns:
        The normalized text as one string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The trailing backslash must be escaped; an unescaped one would swallow
    # the closing quote and leave the literal unterminated.
    punctuation = '!@、";:,.()[]{}<>=-_*&^%$#`~/?|\\'
    # Replace with a space rather than the empty string: deleting punctuation
    # outright would merge neighbours such as "hello,world" into one token.
    table = str.maketrans(dict.fromkeys(punctuation, ' '))
    with open(path, 'r') as f:
        txt = f.read()
    return txt.lower().translate(table)
# Count how often each word occurs in the normalized text and print the ten
# most frequent words with their counts.
hamletTxt = getText()
words = hamletTxt.split()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
# Order the (word, count) pairs by count, highest first.
items.sort(key=lambda x: x[1], reverse=True)
# Slice rather than index over range(10) so that a text with fewer than ten
# distinct words prints what it has instead of raising IndexError.
for word, count in items[:10]:
    print('{0:<10}{1:>5}'.format(word, count))
输出结果:
二、统计《三国演义》中出现次数最多的人名
import jieba
# Segment the novel with jieba, tally multi-character tokens (folding known
# aliases of a character into one name), and print the ten most frequent.
with open("threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()
# Tokens removed from the tally before ranking.
excludes = {"将军","却说","荆州","二人","不可","不能","如此"}
# Alias token -> name it is counted under. The speech suffix is 曰 ("said");
# the look-alike 日 ("day") used previously is unlikely ever to match a token.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相曰": "曹操",
}
words = jieba.lcut(txt)
counts = {}
for word in words:
    # Skip single-character tokens.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1
for word in excludes:
    # pop() with a default tolerates excluded words that never occur in the
    # text; `del` would raise KeyError for them.
    counts.pop(word, None)
items = list(counts.items())
# Order the (name, count) pairs by count, highest first.
items.sort(key=lambda x: x[1], reverse=True)
# Slice so that fewer than ten remaining entries cannot raise IndexError.
for word, count in items[:10]:
    print('{0:<10}{1:>5}'.format(word, count))
输出结果:
转载自原文链接, 如需删除请联系管理员。
原文链接:《jieba 库的常用函数以及实战应用,文本统计字符次数》,转载请注明来源!