반응형
/*******************************************************************************************************************
-- Title : [Py3.5] Named Entity Recognition w/ NLTK
-- Reference : pythonprogramming.net
-- Key word : nltk nlp named entity recognition corpus state_union state union tokenize punktsentencetokenizer
tokenize sent_tokenize word_tokenize sent tokenize word tokenize re_chunk re chunk pos
자연어 처리 형태소 품사 청크 엔티티
*******************************************************************************************************************/
-- Figure
-- Python
"""Named Entity Recognition demo with NLTK.

Reads a training text and a sample text from the working directory, trains a
Punkt sentence tokenizer on the training text, splits the sample text into
sentences, and then word-tokenizes, POS-tags and NE-chunks each sentence,
printing every intermediate result.
"""
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

# ------------------------------
# -- DataSet
# ------------------------------

# -- Load the training text. A context manager guarantees the file is closed
# -- (the previous `train_file.close` lacked parentheses and never ran).
with open('train_text.txt', 'r') as train_file:
    train_text = train_file.read()
print(train_text)
print("[1:train_text]" + "-" * 200)

# -- Load the sample text (same handling as above).
with open('sample_text.txt', 'r') as sample_file:
    sample_text = sample_file.read()
print(sample_text)
print("[2:sample_text]" + "-" * 200)

# -- Alternative: use the datasets bundled with NLTK instead of local files.
# train_text = state_union.raw("2005-GWBush.txt")
# sample_text = state_union.raw("2006-GWBush.txt")

# ------------------------------
# -- Tokenizing
# ------------------------------
# Train the sentence tokenizer on the training text, then split the sample
# text into sentences with it.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
print(tokenized)
print("[3:tokenized_sentence]" + "-" * 200)


# ------------------------------
# -- Chunking : Named Entity Recognition
# ------------------------------
def process_content():
    """Print tokens, POS tags and named-entity chunks for each sentence.

    Iterates over the module-level ``tokenized`` sentence list. For every
    sentence it prints the raw sentence (newlines shown as ``<BR>``), its word
    tokens, its POS-tagged tokens and the result of ``nltk.ne_chunk`` with
    ``binary=True``, then calls ``draw()`` on the chunk result.

    Any exception raised while processing stops the loop; its message is
    printed and the function returns normally (best-effort demo behaviour).
    """
    try:
        for sentence in tokenized:
            # -- Word tokens
            print("<tokenized> " + sentence.replace('\n', '<BR>'))
            words = nltk.word_tokenize(sentence)
            print("<word_tokenize> ")
            print(words)
            print("-" * 20)

            # -- Part-of-speech tags
            tagged = nltk.pos_tag(words)
            print("<tagged> ")
            print(tagged)
            print("-" * 20)

            # -- Named Entity Recognition
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            print("<nameEnt> ")
            print(namedEnt)

            # -- Drawing
            # NOTE(review): draw() presumably opens a GUI window and blocks
            # until it is closed -- confirm against the NLTK Tree docs.
            namedEnt.draw()
            print("=" * 50)
    except Exception as e:
        # Deliberately broad: report the failure instead of crashing the demo.
        print(str(e))


process_content()
-- Files
반응형