/*******************************************************************************************************************
-- Title : [Py3.5] POS Tagging and Lemmatizing - ver.dbrang
-- Keywords : nlp pos tagging pos tag lemmatizing lemmatize lemma natural language processing
              morphological analysis part of speech tagging nltk pandas dataframe pos_tag
              wordnetlemmatizer lemmatizer
*******************************************************************************************************************/
# -*- coding: utf-8 -*-
import pandas as pd
from pandas import DataFrame
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
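# ------------------------------
# -- NLTK Data (one-time setup; a sketch assuming a fresh environment)
# ------------------------------
# pos_tag needs the 'averaged_perceptron_tagger' model and WordNetLemmatizer needs
# the 'wordnet' corpus (some NLTK versions also want 'omw-1.4'). Uncomment if missing:
# import nltk
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')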
# ------------------------------
# -- Set Dataframe Option
# ------------------------------
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# ------------------------------
# -- Sourcing
# ------------------------------
# Tokenizing - Stopwords - Dataframe
# Reference : dbrang.tistory.com/1184
# ------------------------------
# -- Create Dataframe with Sample
# ------------------------------
dict_sample = \
{'a_sent': ['The quick brown fox jumps over the lazy dog',
'He loves his king and his queen.',
'This system is for DATA STORAGE UPON'],
'b_token_word': [['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
['He', 'loves', 'his', 'king', 'and', 'his', 'queen'],
['This', 'system', 'is', 'for', 'DATA', 'STORAGE', 'UPON']],
'c_filter_word': [['The', 'quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'],
['He', 'loves', 'king', 'queen'],
['This', 'system', 'DATA', 'STORAGE', 'UPON']]}
df_word = DataFrame(dict_sample)
print(df_word)
print("... df_word", "." * 100, "\n")
# ------------------------------
# -- Pos Tagging
# ------------------------------
df_word["d_pos_tag"] = df_word["c_filter_word"].apply(lambda x: pos_tag(x))
print(df_word[["c_filter_word", "d_pos_tag"]])
print(",,, Filter by Stopwords", "," * 100, "\n")
# ------------------------------
# -- Lemmatizing
# ------------------------------
# --
# -- Select which POS classes to lemmatize
# --
# Keep only tags whose first letter (tag[0]) is N, V, J, or R: noun (N), verb (V),
# adjective (J), adverb (R) - since lemmatization reduces words to their base forms,
# presumably only these four classes apply..
# Reference : dbrang.tistory.com/1139
# WordNetLemmatizer expects adjectives as 'a', not 'j', so convert
lst_pos = list('NVJR')
fn_j2a = lambda tag: tag if tag != 'j' else 'a'
df_word["e_pos_tag2"] = df_word["d_pos_tag"].apply(
lambda x: [(word, fn_j2a(pos[0].lower())) for word, pos in x if pos[0] in lst_pos])
print(df_word[["d_pos_tag", "e_pos_tag2"]])
print(";;; Change to POS Tag[0]", ";" * 100, "\n")
# --
# -- Lemmatize
# --
lemmatizer = WordNetLemmatizer()
df_word["f_lemma"] = df_word["e_pos_tag2"].apply(lambda x: [(lemmatizer.lemmatize(word, pos), pos) for word, pos in x])
print(df_word[["d_pos_tag", "e_pos_tag2", "f_lemma"]])
print(";;; Lemmatizing", ";" * 100, "\n")
# --
# -- Extract only lemmatized word
# --
df_word["g_lemmaword"] = df_word["f_lemma"].apply(lambda x: [w for w, p in x])
print(df_word[["d_pos_tag", "f_lemma", "g_lemmaword"]])
print(";;; Lemmatizing only printing word", ";" * 100, "\n")