반응형
/*******************************************************************************************************************
-- Title : [Py3.5] Custom Sent_Token and POS Tagging w/ NLTK
-- Reference : pythonprogramming.net
-- Key word : nltk nlp pos state_union.raw punktsentencetokenizer tokenize pos tag pos_tag word tokenize
word_tokenize sent tokenize sent_tokenize 자연어 처리 형태소 품사 pos tagging
word token word_token sent_token sent token custom sent tokenizer
*******************************************************************************************************************/
-- Python
# [Py3.5] Custom sentence tokenizer and POS tagging with NLTK.
# Trains an unsupervised PunktSentenceTokenizer on one text, then
# sentence-tokenizes a second text and POS-tags the first few sentences.
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

print("[0]" + "-"*200, "\n")

# -- Read the training text.
# BUG FIX: the original wrote `train_file.close` (attribute reference,
# no parentheses), so the file was never actually closed. A `with`
# block closes it deterministically even if read() raises.
with open('train_text.txt', 'r') as train_file:
    train_text = train_file.read()
print(train_text)
print("[1]" + "-"*200, "\n")

# -- Read the sample text to be tokenized and tagged.
# Same fix as above: the original `sample_file.close` was a no-op.
with open('sample_text.txt', 'r') as sample_file:
    sample_text = sample_file.read()
print(sample_text)
print("[2]" + "-"*200, "\n")

# # -- Alternative: use the NLTK-provided state_union corpus instead.
# train_text = state_union.raw("2005-GWBush.txt")
# sample_text = state_union.raw("2006-GWBush.txt")

# -- Train the Punkt tokenizer (unsupervised ML) on the training text,
#    then split the sample text into sentences with it.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
print(tokenized)
print("[3]" + "-"*200, "\n")


def process_content():
    """POS-tag the first five tokenized sentences and print each tag list.

    Reads the module-level `tokenized` list; prints a separator line
    followed by the (word, tag) pairs for each sentence.
    """
    try:
        for sentence in tokenized[:5]:
            print("\n---------------------")
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        # Best-effort demo script: report the error instead of crashing
        # (e.g. when NLTK tagger models are not downloaded).
        print(str(e))


# -- Execute Function
process_content()
-- Files
반응형