/*******************************************************************************************************************
-- Title : [Py3.5] User Customized Tokenizer w/ spaCy - ver.dBRang
-- Key word : nlp word_token tokenizer tokenizing tokenization noun chunk chunking chunker pos_tag
              pos_tagging pos_tagger lemma lemmatizing lemmatization word token custom tokenizer
              my_tokenizer cust_tokenizer
*******************************************************************************************************************/
■ Script
# NOTE: this script uses the older spaCy 1.x-style loader (create_make_doc);
#       a spaCy 2.x+ sketch is given after the script.
import re
from spacy import load
from spacy.tokens import Doc
# =======================================
# -- Load Source
# =======================================
# --
# -- Load Source
# --
source = "spaCy is an open-source software library for advanced Natural-Language-Processing, " \
         "written in the programming languages Python -Cython 4.5.678. " \
         "It offers the fastest syntactic parser in the world. " \
         "The library is published under the MIT license and currently supports English and German, " \
         "as well as tokenization for Chinese and several other languages. " \
         "Unlike NLTK, which is mainly * intended for teaching and research, " \
         "spaCy focuses on providing software for production usage. As of version 11.0, " \
         "spaCy also supports deep learning workflows that allow connecting-statistical-models trained " \
         "by popular machine learning libraries like TensorFlow, Keras or Scikit-learn. " \
         "spaCy's machine learning library, Thinc, is also available as a separate open-source Python library."
print(source[:150], "...")
print("... source", "." * 100, "\n")
# =======================================
# -- Create Customized Tokenizer
# =======================================
class cust_tokenizer(object):
    def __init__(self, nlp):
        self.vocab = nlp.vocab

    def __call__(self, text):
        # Keep only lowercase alphabetic words of length >= 2 (hyphens allowed
        # after the first letter); digits and punctuation never become tokens.
        words = re.findall(pattern='[a-zA-Z][-a-zA-Z]+', string=text.lower())
        return Doc(self.vocab, words=words)
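
# Quick sanity check of the regex above (illustrative addition; the sample string
# is not from the original post): numbers, '*', and single letters are dropped.
print(re.findall('[a-zA-Z][-a-zA-Z]+', "Python -Cython 4.5.678 * a".lower()))
# -> ['python', 'cython']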
# =======================================
# -- Load Model
# =======================================
nlp1 = load('en')                                   # default spaCy tokenizer/pipeline
nlp2 = load('en', create_make_doc=cust_tokenizer)   # same pipeline, custom tokenizer (spaCy 1.x hook)
doc1 = nlp1(source)
doc2 = nlp2(source)
# =======================================
# -- Word Tokenization (Ref: dbrang.tistory.com/1244)
#    For plain tokenization, cust_tokenizer works well (it extracts exactly the
#    tokens we want); a token-count comparison follows the prints below.
# =======================================
lst_word_token1 = list(doc1)
lst_word_token2 = list(doc2)
print("lst_word_token1: ", lst_word_token1)
print("lst_word_token2: ", lst_word_token2)
print(",,, word_token from custom tokenizer", "," * 100, "\n")
# =======================================
# -- word_token/lemma/pos_tag
#    cust_tokenizer is effective for tokenization, but the POS tags get heavily
#    distorted -> this also seems to distort the noun chunks (a filtering sketch
#    follows the prints below).
# =======================================
lst_wtoken_lemma_pos1 = [(wtoken.string + '/' + wtoken.lemma_ + '/' + wtoken.pos_) for wtoken in doc1]
lst_wtoken_lemma_pos2 = [(wtoken.string + '/' + wtoken.lemma_ + '/' + wtoken.pos_) for wtoken in doc2]
print("lst_wtoken_lemma_pos1: ", lst_wtoken_lemma_pos1)
print("lst_wtoken_lemma_pos2: ", lst_wtoken_lemma_pos2)
print(",,, word_token/lemma/pos_tag", "," * 100, "\n")
# =======================================
# -- noun chunking
#    For noun chunking, the results from the default tokenizer are considerably
#    more accurate.
# =======================================
lst_noun_chunk1 = [(chunk.string) for chunk in doc1.noun_chunks]
lst_noun_chunk2 = [(chunk.string) for chunk in doc2.noun_chunks]
print("lst_noun_chunk1: ", lst_noun_chunk1)
print("lst_noun_chunk2: ", lst_noun_chunk2)