/*******************************************************************************************************************
-- Title : [Py3.5] POS Tagging and Lemmatizing - ver.dbrang
-- Key word : nlp mop pos tagging pos tag lemmatizing lemmatize lemma natural language processing
                  morphological analysis part of speech tagging part of speech nltk pandas dataframe
                  data frame pos_tag wordnetlemmatizer lemmatizer
*******************************************************************************************************************/

# -*- coding: utf-8 -*-
 
import re
import pandas as pd
from pandas import Series, DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize
from nltk.tag import pos_tag
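 
 
# ------------------------------
# -- Download NLTK Resources (one-time)
# ------------------------------
# The tokenizer, tagger, and lemmatizer below each depend on an NLTK data
# package; a minimal setup sketch, run once if the resources are missing.
import nltk
 
nltk.download('punkt')                        # sent_tokenize / word_tokenize models
nltk.download('stopwords')                    # stopword lists
nltk.download('averaged_perceptron_tagger')   # model behind pos_tag
nltk.download('wordnet')                      # lexicon behind WordNetLemmatizer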
 
 
# ------------------------------
# -- Set Dataframe Option
# ------------------------------
# ('display.height' only exists in older pandas releases, matching the post's Py3.5-era setup.)
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
 
 
# ------------------------------
# -- Sourcing
# ------------------------------
# Tokenizing - Stopwords - Dataframe
# Reference : dbrang.tistory.com/1184
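 
# For context, the b_token_word / c_filter_word columns below can be reproduced
# roughly as follows -- a sketch assuming the tokenizing and stopword filtering
# of the referenced post (details there may differ).
stop_words = set(stopwords.words('english'))
 
def tokenize_and_filter(sent):
    # Case-sensitive filtering, which is why capitalized 'The'/'He'/'This'
    # survive in c_filter_word while lowercase 'the'/'his'/'is' are dropped.
    tokens = regexp_tokenize(sent, r"[\w']+")
    return tokens, [w for w in tokens if w not in stop_words]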
 
 
# ------------------------------
# -- Create Dataframe with Sample
# ------------------------------
dict_sample = \
    {'a_sent': ['The quick brown fox jumps over the lazy dog',
                'He loves his king and his queen.',
                'This system is for DATA STORAGE UPON'],
     'b_token_word': [['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
                      ['He', 'loves', 'his', 'king', 'and', 'his', 'queen'],
                      ['This', 'system', 'is', 'for', 'DATA', 'STORAGE', 'UPON']],
     'c_filter_word': [['The', 'quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'],
                       ['He', 'loves', 'king', 'queen'],
                       ['This', 'system', 'DATA', 'STORAGE', 'UPON']]}
 
df_word = DataFrame(dict_sample)
 
print(df_word)
print("... df_word""." * 100"\n")
 
 
# ------------------------------
# -- Pos Tagging
# ------------------------------
df_word["d_pos_tag"= df_word["c_filter_word"].apply(lambda x: pos_tag(x))
 
print(df_word[["c_filter_word""d_pos_tag"]])
print(",,, Filter by Stopwords""," * 100"\n")
 
 
# ------------------------------
# -- Lemmatizing
# ------------------------------
 
# --
# -- Select POS categories to process
# --
# Use only the first letter of each POS tag (tag[0]): noun (N), verb (V), adjective (J), adverb (R)
# - since lemmatization works on content words, presumably only N/V/J/R apply.
# WordNetLemmatizer expects adjectives as 'a' rather than 'j', so convert.
lst_pos = list('NVJR')
fn_j2a = lambda tag: tag if tag != 'j' else 'a'
 
df_word["e_pos_tag2"= df_word["d_pos_tag"].apply(
    lambda x: [(word, fn_j2a(pos[0].lower())) for word, pos in x if pos[0in lst_pos])
 
print(df_word[["d_pos_tag""e_pos_tag2"]])
print(";;; Change to POS Tag[0]"";" * 100"\n")
 
# --
# -- Lemmatize
# --
lemmatizer = WordNetLemmatizer()
df_word["f_lemma"= df_word["e_pos_tag2"].apply(lambda x: [(lemmatizer.lemmatize(word, pos), pos) for word, pos in x])
 
print(df_word[["d_pos_tag""e_pos_tag2""f_lemma"]])
print(";;; Lemmatizing"";" * 100"\n")
 
# --
# -- Extract only lemmatized word
# --
df_word["g_lemmaword"= df_word["f_lemma"].apply(lambda x: [w for w, p in x])
 
print(df_word[["d_pos_tag""f_lemma""g_lemmaword"]])
print(";;; Lemmatizing only printing word"";" * 100"\n")
 
 