
/*******************************************************************************************************************
-- Title : [Py3.5] Named Entity Recognition w/ NLTK
-- Reference : pythonprogramming.net
-- Key word : nltk nlp named entity recognition corpus state_union state union tokenize punktsentencetokenizer
                  tokenize sent_tokenize word_tokenize sent tokenize word tokenize ne_chunk ne chunk pos
                  natural language processing morpheme part-of-speech chunk entity
*******************************************************************************************************************/

-- Python

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
 
# ------------------------------
# -- DataSet
# ------------------------------
 
# -- train text open
train_file = open('train_text.txt', 'r')
train_text = train_file.read()
train_file.close()
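# -- Alternative (equivalent): a 'with' block closes the file automatically,
#    even if an exception occurs:
#        with open('train_text.txt', 'r') as f:
#            train_text = f.read()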
 
print(train_text)
print("[1:train_text]" + "-"*200)
 
# -- sample text open
sample_file = open('sample_text.txt', 'r')
sample_text = sample_file.read()
sample_file.close()
 
print(sample_text)
print("[2:sample_text]" + "-"*200)
 
# # -- Load the datasets bundled with NLTK instead (State of the Union corpus)
# train_text = state_union.raw("2005-GWBush.txt")
# sample_text = state_union.raw("2006-GWBush.txt")
 
 
# ------------------------------
# -- Tokenizing
# ------------------------------
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
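# -- Alternative: NLTK also ships a pretrained Punkt model that works without
#    custom training data (assumes the 'punkt' resource has been downloaded):
#        from nltk.tokenize import sent_tokenize
#        tokenized = sent_tokenize(sample_text)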
 
print(tokenized)
print("[3:tokenized_sentence]" + "-"*200)
 
 
# ------------------------------
# -- Chunking : Named Entity Recognition
# ------------------------------
def process_content():
    try:
        for i in tokenized:
            # -- Word tokenization
            print("<tokenized> " + i.replace('\n', '<BR>'))
            words = nltk.word_tokenize(i)
            print("<word_tokenize> ")
            print(words)
            print("-" * 20)
 
            # -- Print the POS tags
            tagged = nltk.pos_tag(words)
            print ("<tagged> ")
            print (tagged)
 
            print("-" * 20)
 
            # -- Named Entity Recognition
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            print ("<nameEnt> ")
            print (namedEnt)
 
            # -- Drawing (opens a window with the chunk tree for each sentence)
            namedEnt.draw()
            print("=" * 50)
    except Exception as e:
        print(str(e))
 
 
process_content()
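
For reference, a minimal sketch of collecting the recognized entities as plain
strings instead of opening a drawing window per sentence. extract_entities and
the sample sentence are hypothetical additions, and it assumes the 'punkt',
'averaged_perceptron_tagger', 'maxent_ne_chunker', and 'words' resources have
been installed via nltk.download().

import nltk

def extract_entities(sentence):
    # -- Tokenize, POS-tag, and chunk the sentence, as in process_content()
    words = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(words)
    tree = nltk.ne_chunk(tagged, binary=True)  # binary=True labels every entity 'NE'

    # -- Join the words under each 'NE' subtree into one string per entity
    entities = []
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NE'):
        entities.append(' '.join(word for word, tag in subtree.leaves()))
    return entities

print(extract_entities("George Bush met with Congress in Washington."))
# -- expected output (model-dependent): ['George Bush', 'Congress', 'Washington']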
 



-- Files

sample_text.txt

train_text.txt

