반응형

/*******************************************************************************************************************
-- Title : [Py3.5] Chunking and Chunking Rule w/ NLTK - ver.dBRang
-- Reference : dbrang
-- Key word : nltk mop pos word tokenize sent tokenize token 토큰 토크나이징 chunk rule chunking 
                  regexpparser 자연어 처리 형태소 pos tagging pos_tag 청크 청킹 토크나이즈 tokenizing
*******************************************************************************************************************/

■ Figure

  


■ Chunking and Chunking Rule

# -*- coding: utf-8 -*-
 
import re
import pandas as pd
from pandas import Series, DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser
 
# ------------------------------
# -- Set Dataframe Option
# ------------------------------
# Widen pandas console output so the sample frames print on one screen.
# NOTE(review): 'display.height' was removed in pandas 0.20+; guard it so the
# script also runs on modern pandas versions.
try:
    pd.set_option('display.height', 1000)
except (pd.errors.OptionError, KeyError):
    pass
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
 
# ------------------------------
# -- Create Dataframe with Sample
# ------------------------------
# Three sample sentences ('a_sent') paired with their pre-tokenized
# word lists ('b_token_word'), one row per sentence.
dict_sample = \
    {'a_sent': ['The quick brown fox jumps over the lazy dog',
                'He loves his king and his queen.',
                'This system is for DATA STORAGE UPON'],
     'b_token_word': [['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
                      ['He', 'loves', 'his', 'king', 'and', 'his', 'queen'],
                      ['This', 'system', 'is', 'for', 'DATA', 'STORAGE', 'UPON']]}

df_samp = DataFrame(dict_sample)

print(df_samp)
print("... df_samp.sample dataframe", "." * 100, "\n")
 
 
# ------------------------------
# -- Add POS Tagged Column
# ------------------------------
# POS : dbrang.tistory.com/1194
# Tag each pre-tokenized word list; 'c_pos_tag' holds lists of
# (word, POS) tuples produced by nltk.tag.pos_tag.
df_samp["c_pos_tag"] = df_samp["b_token_word"].apply(lambda x: pos_tag(x))

print(df_samp[["b_token_word", "c_pos_tag"]])
print(",,, df_samp.add pos_tag()", ",etc" * 0 + "," * 100, "\n")
 
 
# ------------------------------
# -- Chunking
# ------------------------------

# -- Define the chunk rule: an NP is an optional determiner (<DT>?),
#    any number of adjectives (<JJ>*), then a noun (<NN>).
chunk_rule = "NP: {<DT>?<JJ>*<NN>}"
chunk_parse = RegexpParser(chunk_rule)

# -- Extract chunks: parse each row's (word, POS) tuples into an nltk Tree.
df_samp["d_chunk"] = df_samp["c_pos_tag"].apply(lambda x: chunk_parse.parse(x))

print(df_samp[["c_pos_tag", "d_chunk"]])
print(",,, df_samp.add chunk", "," * 100, "\n")
 
 
# ------------------------------
# -- Draw Chunk
# ------------------------------
# Print each chunk tree, then open it in the NLTK tree viewer.
# NOTE(review): chunk.draw() opens a Tk window and blocks until it is
# closed — requires a GUI environment.
for chunk in df_samp["d_chunk"]:
    print(chunk)
    print("-" * 30)
    chunk.draw()

print("--- print and draw chunk", "-" * 100, "\n")
 
 


반응형

+ Recent posts