Using jieba word segmentation in Python

Using the Flask framework

Install dependencies

pip3 install Flask Flask_RESTful

Hello World implementation

#!/usr/bin/python3
# encoding:utf-8

from flask import Flask
from flask_restful import Api, Resource

# initialize the app and the API
app = Flask(__name__)
api = Api(app)

class api_hello(Resource):
    def get(self):
        return 'hello world!'

api.add_resource(api_hello, '/')

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=8080, debug=False)
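
Start the service locally (the file is assumed here to be saved as app.py); Flask's built-in server then listens on port 8080:

python3 app.py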

Making a request

curl -X GET localhost:8080
#hello world!

Installing jieba

pip3 install jieba
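
As a quick check that the install works, the snippet below runs jieba's two basic cutting modes. The sample sentence is the one from jieba's own documentation; the splits shown in the comments depend on the bundled dictionary and are only illustrative.

#!/usr/bin/python3
# encoding:utf-8
import jieba

# default (accurate) mode
print("/".join(jieba.cut("我来到北京清华大学")))
# 我/来到/北京/清华大学

# full mode lists every word the dictionary can find in the sentence
print("/".join(jieba.cut("我来到北京清华大学", cut_all=True)))
# 我/来到/北京/清华/清华大学/华大/大学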

Segmentation by part of speech

#!/usr/bin/python3
# encoding:utf-8
import re
import jieba.posseg as pseg

def wordcount(string_data):

    # text preprocessing: strip tabs, newlines and punctuation
    pattern = re.compile(r'\t|\n|\.|-|:|;|\)|\(|\?|"')  # characters to remove
    string_data = re.sub(pattern, '', string_data)  # drop everything that matches

    # POS-tagged segmentation; materialize the generator so it can be printed and reused
    seg_list_exact = list(pseg.cut(string_data))

    print(seg_list_exact)
    return seg_list_exact
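
Each element produced by pseg.cut is a pair whose .word and .flag attributes carry the token and its POS tag; the stopword filtering in the next section relies on exactly these attributes. A small usage sketch (the split and tags in the comments are illustrative only):

for seg in wordcount("今天天气很好"):
    print(seg.word, seg.flag)
# 今天 t
# 天气 n
# 很 d
# 好 a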

Setting up stopwords

#!/usr/bin/python3
# encoding:utf-8
import os
import re
import jieba.posseg as pseg

def wordcount(string_data):

    # text preprocessing: strip tabs, newlines and punctuation
    pattern = re.compile(r'\t|\n|\.|-|:|;|\)|\(|\?|"')  # characters to remove
    string_data = re.sub(pattern, '', string_data)  # drop everything that matches

    # POS-tagged segmentation
    seg_list_exact = pseg.cut(string_data)

    object_list = []
    # load the stopword list, one word per line
    stopwords_path = os.path.join('.', "cn_stopwords.txt")
    with open(stopwords_path, "r", encoding="utf-8") as f:
        remove_words = [line.strip() for line in f]

    for seg in seg_list_exact:
        if seg.word not in remove_words:  # keep only words that are not stopwords
            object_list.append(seg)  # append the (word, flag) pair

    print(object_list)
    return object_list
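
cn_stopwords.txt is expected to contain one stopword per line in UTF-8; any of the commonly circulated Chinese stopword lists saved under this name will work. A few typical entries:

的
了
是
在
和
吗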

Submitting text through the API

app.py

#!/usr/bin/python3
# encoding:utf-8

from flask import Flask, request
from flask_restful import reqparse, abort, Api, Resource
from flask_cors import CORS
from gevent import pywsgi
from wordcount import wordcount

# initialize the app and the API
app = Flask(__name__)
api = Api(app)
CORS(app, supports_credentials=True)

parser = reqparse.RequestParser()
parser.add_argument('text', location='json')


class api_word_cloud(Resource):
    def post(self):
        if not request.json or 'text' not in request.json:
            abort(400, message="bad request")
        args = parser.parse_args()
        return {
            'code': 200,
            'data': wordcount(args['text'])
        }


api.add_resource(api_word_cloud, '/')

if __name__ == '__main__':
    server = pywsgi.WSGIServer(('0.0.0.0', 8080), app)
    server.serve_forever()
    # app.run(host="0.0.0.0", port=8080, debug=False)
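
This version additionally depends on flask-cors and gevent, so install them before running locally; the request body below is just a sample sentence:

pip3 install flask-cors gevent
python3 app.py
curl -H "Content-Type: application/json" -X POST -d '{"text":"今天天气很好"}' http://localhost:8080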

wordcount.py

#!/usr/bin/python3
# encoding:utf-8

# imports
import os
import re  # regular expressions
import collections  # word frequency counting
import jieba  # jieba segmentation
import jieba.posseg as pseg


def wordcount(text):
    string_data = text

    # text preprocessing: strip tabs, newlines and punctuation
    pattern = re.compile(r'\t|\n|\.|-|:|;|\)|\(|\?|"')  # characters to remove
    string_data = re.sub(pattern, '', string_data)  # drop everything that matches

    # POS-tagged segmentation
    seg_list_exact = pseg.cut(string_data)
    object_list = []
    # load the stopword list, one word per line
    stopwords_path = os.path.join('.', "cn_stopwords.txt")
    with open(stopwords_path, "r", encoding="utf-8") as f:
        remove_words = [line.strip() for line in f]

    word_counter = collections.Counter()
    part_counter = collections.Counter()
    part_words = {}

    # POS flag -> Chinese display name (returned to the client as-is)
    flags = {
        'a': '形容词',
        'c': '连词',
        'd': '副词',
        'e': '叹词',
        'f': '方位词',
        'm': '数词',
        'n': '名词',
        'p': '介词',
        'q': '量词',
        'r': '代词',
        's': '处所词',
        't': '时间词',
        'u': '助词',
        'v': '动词',
        'un': '未知词'
    }

    for seg in seg_list_exact:  # iterate over every (word, flag) pair
        if seg.flag not in flags:
            continue
        if seg.word not in remove_words:  # skip stopwords
            object_list.append(seg)
            word_counter[seg.word] += 1
            part_counter[seg.flag] += 1
            if seg.flag not in part_words:
                part_words[seg.flag] = []
            if seg.word not in part_words[seg.flag]:
                part_words[seg.flag].append(seg.word)

    part_result_words = {}
    part_result_words_flag = []
    for flag in part_words:
        flag_name = flags[flag]
        part_result_words[flag_name] = []
        for word in part_words[flag]:
            part_result_words[flag_name].append({
                'name': word,
                'value': word_counter[word]
            })
            part_result_words_flag.append({
                'name': word,
                'flag': flag_name,
                'value': word_counter[word]
            })
        # keep the 10 most frequent words per POS category
        part_result_words[flag_name] = sorted(
            part_result_words[flag_name], key=lambda x: x['value'], reverse=True)[:10]

    # keep the 500 most frequent words overall
    part_result_words_flag = sorted(
        part_result_words_flag, key=lambda x: x['value'], reverse=True)[:500]

    part_result = []
    for flag in part_counter:
        part_result.append({
            'name': flags[flag],
            'value': part_counter[flag]
        })
    # top 10 POS categories by total word count
    part_result = sorted(
        part_result, key=lambda x: x['value'], reverse=True)[:10]
    return {
        'aggs_result': part_result,
        'detail_result': part_result_words,
        'result': part_result_words_flag
    }
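
The returned dictionary has three parts: aggs_result (top 10 POS categories by count), detail_result (top 10 words per category), and result (top 500 words overall with their category). The values below are made up purely to illustrate the shape:

{
  "aggs_result": [{"name": "名词", "value": 12}, {"name": "动词", "value": 7}],
  "detail_result": {"名词": [{"name": "天气", "value": 3}], "动词": [{"name": "散步", "value": 2}]},
  "result": [{"name": "天气", "flag": "名词", "value": 3}, {"name": "散步", "flag": "动词", "value": 2}]
}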

Running the app with Docker

Generating requirements.txt

pip3 install pipreqs
pipreqs . --encoding=utf8 --force
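
The generated requirements.txt should list roughly the packages below; the exact names and version pins depend on what pipreqs detects in your environment, so treat these as an example only:

Flask==2.0.1
Flask_Cors==3.0.10
Flask_RESTful==0.3.9
gevent==21.8.0
jieba==0.42.1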

Writing the Dockerfile

FROM python:3.7
WORKDIR /app

ADD . /app

RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && pip3 install -r requirements.txt

CMD ["python", "app.py"]

Deploy

docker build -t jieba-app:latest .
docker run --name jieba-app -p 8080:8080 jieba-app:latest

Request

curl -H "Content-Type: application/json" -X POST -d '{"text":""}' http://localhost:8080
#{"code": 200, "data": {"aggs_result": [], "detail_result": {}, "result": []}}