/*******************************************************************************************************************
-- Title : [NLP] IE (Information Extraction) Flow - Tokenizing/POS Tagging/Chunking
-- Reference : www.nltk.org/book/ch07.html (from ch01)
-- Keywords : ie information extraction segmentation tokenization pos part of speech entity relation
              sent_tokenize word_tokenize pos_tag chunk chunking chunker morpheme
              tokenizing pos tagging
*******************************************************************************************************************/
■ IE Architecture Flow
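Per the referenced chapter, the IE pipeline runs: raw text → sentence segmentation → tokenization → part-of-speech tagging → entity detection (chunking) → relation detection. The code below implements the stages up to noun-phrase chunking.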
■ IE with NLTK
# -*- coding: utf-8 -*-
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser


# ------------------------------
# -- function of tokenization / POS tagging
# ------------------------------
def ie_preprocess(document):
    # -- sentence segmentation
    sent_token = sent_tokenize(document)
    print("sent_tokenize : ", sent_token)

    # -- word tokenization per sentence
    word_token = [word_tokenize(sent) for sent in sent_token]
    print("word_tokenize : ", word_token)

    # -- part-of-speech tagging per sentence
    pos_tags = [pos_tag(sent) for sent in word_token]
    print("pos_tag : ", pos_tags)

    return pos_tags


# ------------------------------
# -- raw_text
# ------------------------------
raw_text = \
"""
The quick brown fox jumps over the lazy dog.
He loves his king and his queen.
This system is for DATA STORAGE UPON.
"""
print(raw_text)
print("... raw_text", "." * 100, "\n")


# ------------------------------
# -- call function
# ------------------------------
pos_tags = ie_preprocess(raw_text)
print(pos_tags)
print("... call_function", "." * 100, "\n")


# ------------------------------
# -- chunking
# ------------------------------
# -- count the sentences
sent_cnt = len(pos_tags)
print(sent_cnt)
print("*** sentence count", "*" * 100, "\n")

# -- define the chunk grammar and build the chunk parser
# -- NP = optional determiner (DT), any number of adjectives (JJ), then a noun (NN)
chunk_rule = "NP: {<DT>?<JJ>*<NN>}"
chunk_parse = RegexpParser(chunk_rule)

# -- extract (and draw) the chunks per sentence
for i in pos_tags:
    print("### pos_tags: ", i)
    chunk = chunk_parse.parse(i)
    print(chunk)
    print("-" * 20)
    chunk.draw()          # opens a tree window per sentence (requires a GUI/Tk)
print("*** chunking", "*" * 100, "\n")
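The script above stops at noun-phrase chunking. The next stage in the IE flow is entity detection; the snippet below is a minimal, self-contained sketch of it using NLTK's built-in ne_chunk. The sample sentence and variable names are illustrative only (not from the original script), and depending on the NLTK version you may first need nltk.download() for punkt, averaged_perceptron_tagger, maxent_ne_chunker, and words.

# -- entity detection stage of the IE flow: a minimal, self-contained sketch
# -- (sample sentence below is hypothetical, not from the original post)
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

sample = "Mark works for Samsung Electronics in Seoul."

# -- tokenize, POS-tag, then run the named-entity chunker
tagged = pos_tag(word_tokenize(sample))
ne_tree = ne_chunk(tagged)      # default binary=False -> PERSON/ORGANIZATION/GPE/... labels
print(ne_tree)

# -- list the recognized named entities (every subtree except the sentence root "S")
for subtree in ne_tree.subtrees():
    if subtree.label() != "S":
        print(subtree.label(), ":", " ".join(word for word, tag in subtree.leaves()))

Passing binary=True to ne_chunk collapses all entity types into a single NE label, which can be useful when only entity boundaries (not types) are needed.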