/*******************************************************************************************************************
-- Title : [Py3.5] POS Tagging and Lemmatizing - ver.dbrang
-- Key word : nlp mop pos tagging pos tag lemmatizing lemmatize lemma natural language processing
                  morphological analysis part of speech tagging part of speech nltk pandas dataframe
                  data frame pos_tag wordnetlemmatizer lemmatizer
*******************************************************************************************************************/

# -*- coding: utf-8 -*-
 
import re
import pandas as pd
from pandas import Series, DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize
from nltk.tag import pos_tag
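 
 
# ------------------------------
# -- Download NLTK Resources (one-time)
# ------------------------------
# The tokenizer, tagger, and lemmatizer below each depend on an NLTK data
# package; a minimal setup sketch, run once if the resources are missing.
import nltk
 
nltk.download('punkt')                        # sent_tokenize / word_tokenize models
nltk.download('stopwords')                    # stopword lists
nltk.download('averaged_perceptron_tagger')   # model behind pos_tag
nltk.download('wordnet')                      # lexicon behind WordNetLemmatizer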
 
 
# ------------------------------
# -- Set Dataframe Option
# ------------------------------
# ('display.height' only exists in older pandas releases, matching the post's Py3.5-era setup.)
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
 
 
# ------------------------------
# -- Sourcing
# ------------------------------
# Tokenizing - Stopwords - Dataframe
# Reference : dbrang.tistory.com/1184
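 
# For context, the b_token_word / c_filter_word columns below can be reproduced
# roughly as follows -- a sketch assuming the tokenizing and stopword filtering
# of the referenced post (details there may differ).
stop_words = set(stopwords.words('english'))
 
def tokenize_and_filter(sent):
    # Case-sensitive filtering, which is why capitalized 'The'/'He'/'This'
    # survive in c_filter_word while lowercase 'the'/'his'/'is' are dropped.
    tokens = regexp_tokenize(sent, r"[\w']+")
    return tokens, [w for w in tokens if w not in stop_words]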
 
 
# ------------------------------
# -- Create Dataframe with Sample
# ------------------------------
dict_sample = \
    {'a_sent': ['The quick brown fox jumps over the lazy dog',
                'He loves his king and his queen.',
                'This system is for DATA STORAGE UPON'],
     'b_token_word': [['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
                      ['He', 'loves', 'his', 'king', 'and', 'his', 'queen'],
                      ['This', 'system', 'is', 'for', 'DATA', 'STORAGE', 'UPON']],
     'c_filter_word': [['The', 'quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'],
                       ['He', 'loves', 'king', 'queen'],
                       ['This', 'system', 'DATA', 'STORAGE', 'UPON']]}
 
df_word = DataFrame(dict_sample)
 
print(df_word)
print("... df_word""." * 100"\n")
 
 
# ------------------------------
# -- Pos Tagging
# ------------------------------
df_word["d_pos_tag"= df_word["c_filter_word"].apply(lambda x: pos_tag(x))
 
print(df_word[["c_filter_word""d_pos_tag"]])
print(",,, Filter by Stopwords""," * 100"\n")
 
 
# ------------------------------
# -- Lemmatizing
# ------------------------------
 
# --
# -- Select POS categories to process
# --
# Use only the first letter of each POS tag (tag[0]): noun (N), verb (V), adjective (J), adverb (R)
# - since lemmatization works on content words, presumably only N/V/J/R apply.
# WordNetLemmatizer expects adjectives as 'a' rather than 'j', so convert.
lst_pos = list('NVJR')
fn_j2a = lambda tag: tag if tag != 'j' else 'a'
 
df_word["e_pos_tag2"= df_word["d_pos_tag"].apply(
    lambda x: [(word, fn_j2a(pos[0].lower())) for word, pos in x if pos[0in lst_pos])
 
print(df_word[["d_pos_tag""e_pos_tag2"]])
print(";;; Change to POS Tag[0]"";" * 100"\n")
 
# --
# -- Lemmatize
# --
lemmatizer = WordNetLemmatizer()
df_word["f_lemma"= df_word["e_pos_tag2"].apply(lambda x: [(lemmatizer.lemmatize(word, pos), pos) for word, pos in x])
 
print(df_word[["d_pos_tag""e_pos_tag2""f_lemma"]])
print(";;; Lemmatizing"";" * 100"\n")
 
# --
# -- Extract only lemmatized word
# --
df_word["g_lemmaword"= df_word["f_lemma"].apply(lambda x: [w for w, p in x])
 
print(df_word[["d_pos_tag""f_lemma""g_lemmaword"]])
print(";;; Lemmatizing only printing word"";" * 100"\n")
 
 