반응형
/*******************************************************************************************************************
-- Title : [Py3.5] Chunking and Chunking Rule w/ NLTK - ver.dBRang
-- Reference : dbrang
-- Key word : nltk mop pos word tokenize sent tokenize token 토큰 토크나이징 chunk rule chunking
regexpparser 자연어 처리 형태소 pos tagging pos_tag 청크 청킹 토크나이즈 tokenizing
*******************************************************************************************************************/
■ Figure
■ Chunking and Chunking Rule
# -*- coding: utf-8 -*-
"""Chunking demo with NLTK: POS-tag tokenized sentences, then extract NP
chunks with a RegexpParser rule, keeping every stage in a pandas DataFrame.
"""

import re

import pandas as pd
from pandas import Series, DataFrame

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

# ------------------------------
# -- Set Dataframe Option
# ------------------------------
# NOTE: 'display.height' was deprecated and later removed from pandas;
# setting it raises pandas.errors.OptionError on modern versions, so that
# call is intentionally omitted here (the remaining options cover display).
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# ------------------------------
# -- Create Dataframe with Sample
# ------------------------------
# Token : dbrang.tistory.com/1183
dict_sample = \
    {'a_sent': ['The quick brown fox jumps over the lazy dog',
                'He loves his king and his queen.',
                'This system is for DATA STORAGE UPON'],
     'b_token_word': [['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
                      ['He', 'loves', 'his', 'king', 'and', 'his', 'queen'],
                      ['This', 'system', 'is', 'for', 'DATA', 'STORAGE', 'UPON']]}

df_samp = DataFrame(dict_sample)

print(df_samp)
print("... df_samp.sample dataframe", "." * 100, "\n")

# ------------------------------
# -- Add POS Tagged Column
# ------------------------------
# POS : dbrang.tistory.com/1194
# pos_tag takes a token list directly, so no lambda wrapper is needed.
df_samp["c_pos_tag"] = df_samp["b_token_word"].apply(pos_tag)

print(df_samp[["b_token_word", "c_pos_tag"]])
print(",,, df_samp.add pos_tag()", "," * 100, "\n")

# ------------------------------
# -- Chunking
# ------------------------------
# Chunk : dbrang.tistory.com/1193

# -- Define the chunk rule: an NP is an optional determiner, any number of
# -- adjectives, then a singular noun.
chunk_rule = "NP: {<DT>?<JJ>*<NN>}"
chunk_parse = RegexpParser(chunk_rule)

# -- Extract chunks: parse each row's POS-tagged token list into a Tree.
df_samp["d_chunk"] = df_samp["c_pos_tag"].apply(lambda x: chunk_parse.parse(x))

print(df_samp[["c_pos_tag", "d_chunk"]])
print(",,, df_samp.add chunk", "," * 100, "\n")

# ------------------------------
# -- Draw Chunk
# ------------------------------
# NOTE: Tree.draw() opens a blocking Tk window per chunk; this requires a
# GUI-capable environment and pauses the loop until each window is closed.
for chunk in df_samp["d_chunk"]:
    print(chunk)
    print("-" * 30)
    chunk.draw()

print("--- print and draw chunk", "-" * 100, "\n")
반응형