반응형

/*******************************************************************************************************************
-- Title : [Py3.5] Custom Sent_Token and POS Tagging w/ NLTK
-- Reference : pythonprogramming.net
-- Key word : nltk nlp pos state_union.raw punktsentencetokenizer tokenize pos tag pos_tag word tokenize
                  word_tokenize sent tokenize sent_tokenize 자연어 처리 형태소 품사 pos tagging
                  word token word_token sent_token sent token custom sent tokenizer
*******************************************************************************************************************/

-- Python

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
 
print("[0]" + "-" * 200 + "\n")  # section separator for console output

# -- Load the training text (used to train the Punkt sentence tokenizer).
#    `with` guarantees the file handle is closed even on error.
with open('train_text.txt', 'r') as train_file:
    train_text = train_file.read()

print(train_text)
print("[1]" + "-" * 200 + "\n")

# -- Load the sample text (the text we will sentence-tokenize and POS-tag).
with open('sample_text.txt', 'r') as sample_file:
    sample_text = sample_file.read()

print(sample_text)
print("[2]" + "-" * 200 + "\n")

# # -- Alternative: use NLTK's bundled State of the Union corpus instead
# train_text = state_union.raw("2005-GWBush.txt")
# sample_text = state_union.raw("2006-GWBush.txt")

# -- Train a Punkt tokenizer on train_text (unsupervised ML), then use it
#    to split sample_text into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

print(tokenized)
print("[3]" + "-" * 200 + "\n")
 
# -- POS-tag sentence tokens
def process_content(sentences=None, limit=5):
    """Word-tokenize and POS-tag up to `limit` sentences, printing each result.

    Args:
        sentences: Iterable of sentence strings. Defaults to the module-level
            `tokenized` list produced by the Punkt tokenizer above, preserving
            the original no-argument behavior.
        limit: Maximum number of sentences to process (default 5, as in the
            original hard-coded slice).
    """
    if sentences is None:
        sentences = tokenized
    try:
        for sentence in sentences[:limit]:
            print("\n---------------------")
            words = nltk.word_tokenize(sentence)  # sentence -> word tokens
            tagged = nltk.pos_tag(words)          # word tokens -> (word, tag) pairs
            print(tagged)

    except Exception as e:
        # Tutorial-style best-effort: report the error (e.g. a missing NLTK
        # model raises LookupError) instead of aborting the script.
        print(str(e))


# -- Execute Function
process_content()
 
 

-- Files

sample_text.txt

train_text.txt


반응형

+ Recent posts