from collections import Counter

import jieba as jie
# Words excluded from the ranking: frequent tokens that are not character
# names (generic vocabulary, place names, titles, narrative filler).
nowords = {
    "这个", "引兵", "次日", "人马", "不知", "汉中", "众将",
    "只见", "大喜", "天下", "东吴", "于是", "今日", "不敢", "魏兵",
    "陛下", "太守", "天子", "一面", "原来", "令人", "江东", "喊声",
    "下马", "何不", "大军", "忽报", "先生", "百姓", "然后", "何故",
    "先锋", "不如", "赶来", "此人", "夫人", "先主", "后人", "背后",
    "城中", "蜀兵", "上马", "大叫", "都督", "一人", "如何", "商议",
    "却说", "不可", "不能", "如此", "将军", "二人", "后主", "荆州",
    "主公", "军马", "军士", "左右", "正是", "徐州", "忽然",
    "因此", "成都", "未知", "不见", "大败", "大事", "之后", "一军",
    "起兵", "引军", "军中", "接应", "进兵", "大惊", "可以", "大怒",
    "不得", "以为", "心中", "一声", "下文", "曹兵", "追赶",
}

# Alternate tokens that are folded into a single canonical name before
# counting, so that one character is not split across several entries.
name_aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
    "关公": "关羽",
    "云长": "关羽",
}


def count_words(words):
    """Return a Counter of word frequencies for the segmented *words*.

    Tokens of length 1 are skipped, tokens listed in ``name_aliases`` are
    counted under their canonical name, and every entry of ``nowords`` is
    removed from the result.
    """
    counts = Counter(
        name_aliases.get(word, word) for word in words if len(word) != 1
    )
    for word in nowords:
        # pop() with a default: a stop word that never occurs in the text
        # must not abort the run with KeyError.
        counts.pop(word, None)
    return counts


def top_words(counts, limit=20):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Ties keep the order in which the words were first counted (the sort is
    stable). Fewer than *limit* pairs are returned when *counts* is small.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:limit]


def main():
    """Segment 三国演义.txt with jieba and print the 20 most frequent words."""
    # Context manager so the file handle is closed as soon as it is read.
    with open('三国演义.txt', 'r', encoding='utf-8') as source:
        text = source.read()
    words = jie.lcut(text)
    for word, count in top_words(count_words(words)):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()