1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
|
import os import re import collections import jieba import jieba.posseg as pseg
# Characters removed from the text before segmentation: tab, newline and a
# handful of ASCII punctuation marks. Written as a raw string so the regex
# engine (not the Python string parser) interprets the escapes.
_STRIP_CHARS_RE = re.compile(r'[\t\n.\-:;()?"]')

# Part-of-speech tags that are counted, mapped to their display names.
# Segments whose tag is not exactly one of these keys are ignored.
_POS_NAMES = {
    'a': '形容词',
    'c': '连词',
    'd': '副词',
    'e': '叹词',
    'f': '方位词',
    'm': '数词',
    'n': '名词',
    'p': '介词',
    'q': '量词',
    'r': '代词',
    's': '处所词',
    't': '时间词',
    'u': '助词',
    'v': '动词',
    'un': '未知词',
}


def wordcount(text, stopwords_path=None):
    """Segment *text* with jieba and summarise word / part-of-speech counts.

    The text is stripped of tabs, newlines and some ASCII punctuation, cut
    into (word, flag) segments with ``jieba.posseg``, and every segment whose
    flag is a key of ``_POS_NAMES`` and whose word is not a stopword is
    counted.

    Args:
        text: The string to analyse.
        stopwords_path: Path of a UTF-8 file with one stopword per line.
            Defaults to ``./cn_stopwords.txt`` (relative to the current
            working directory), which is what this function always used.

    Returns:
        A dict with three keys:

        * ``'aggs_result'``: up to 10 ``{'name': pos_name, 'value': count}``
          entries, one per part of speech, most frequent first.
        * ``'detail_result'``: maps each part-of-speech display name to its
          up to 10 most frequent words as ``{'name': word, 'value': count}``.
        * ``'result'``: up to 500 ``{'name': word, 'flag': pos_name,
          'value': count}`` entries, most frequent first.

        NOTE: the ``value`` of a word is its total count over all kept
        segments, regardless of flag; a word that is segmented with two
        different flags reports the same combined count under both.

    Raises:
        OSError: If the stopword file cannot be opened.
    """
    cleaned_text = _STRIP_CHARS_RE.sub('', text)

    if stopwords_path is None:
        stopwords_path = os.path.join('.', "cn_stopwords.txt")
    # A set gives O(1) membership tests; ``with`` closes the file promptly.
    with open(stopwords_path, "r", encoding="utf-8") as stopwords_file:
        stopwords = {line.strip() for line in stopwords_file}

    word_counter = collections.Counter()
    part_counter = collections.Counter()
    # flag -> distinct words in first-seen order. ``seen_pairs`` backs the
    # uniqueness check so it does not scan the list for every segment.
    part_words = {}
    seen_pairs = set()

    for seg in pseg.cut(cleaned_text):
        if seg.flag not in _POS_NAMES or seg.word in stopwords:
            continue
        word_counter[seg.word] += 1
        part_counter[seg.flag] += 1
        pair = (seg.flag, seg.word)
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            part_words.setdefault(seg.flag, []).append(seg.word)

    def by_value(entry):
        return entry['value']

    part_result_words = {}
    part_result_words_flag = []
    for flag, words in part_words.items():
        flag_name = _POS_NAMES[flag]
        entries = [
            {'name': word, 'value': word_counter[word]} for word in words
        ]
        part_result_words_flag.extend(
            {'name': word, 'flag': flag_name, 'value': word_counter[word]}
            for word in words
        )
        # sorted() is stable, so ties keep first-seen order.
        part_result_words[flag_name] = sorted(
            entries, key=by_value, reverse=True)[:10]

    part_result_words_flag = sorted(
        part_result_words_flag, key=by_value, reverse=True)[:500]

    part_result = sorted(
        (
            {'name': _POS_NAMES[flag], 'value': count}
            for flag, count in part_counter.items()
        ),
        key=by_value,
        reverse=True,
    )[:10]

    return {
        'aggs_result': part_result,
        'detail_result': part_result_words,
        'result': part_result_words_flag,
    }
|