반응형

/*******************************************************************************************************************
-- Title : [Py3.5] Stemming and Lemmatization w/ NLTK
-- Reference : excelsior-cjh.tistory.com
-- Key word : nlp nltk 자연어 처리 자연어처리 품사 스테밍 stemming stemmer 스테머 lemmatization
                  porter lancaster regexp snowball 정규식 정규 표현식 정규표현식 regular expression
                  lemmatizing
*******************************************************************************************************************/

-- Stemming

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.regexp import RegexpStemmer
from nltk.stem.snowball import SnowballStemmer
 
# ------------------------------
# -- Stemming
# ------------------------------
 
# -- Porter stemming algorithm
#    Reduces each word to its stem. The result is not necessarily a
#    dictionary word (see 'cookeri' in the expected output below).
stemmer = PorterStemmer()
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))
print(stemmer.stem('cooked'))
"""
cook
cookeri
cook
"""

# Section separator: label, a 100-dash rule, and a trailing newline,
# passed to print() as three separate arguments.
print("[Poster_Stem]", "-"*100, "\n")
 
 
# -- Lancaster stemmer class
#    Original author's note: Lancaster performs better than Porter
#    (claim not verified here). Same three words as the Porter demo so
#    the outputs can be compared directly.
stemmer = LancasterStemmer()
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))
print(stemmer.stem('cooked'))
"""
cook
cookery
cook
"""

# Section separator: label, a 100-dash rule, and a trailing newline,
# passed to print() as three separate arguments.
print("[Lancaster_Stem]", "-"*100, "\n")
 
 
# -- The RegexpStemmer class
#    Stems by removing whatever matches a regular expression.
#    Usable on its own, but best suited to special cases that the Porter
#    and Lancaster stemmers do not handle.
#    The pattern 'ing' is not anchored, so it is also removed from the
#    start of a word ('ingleside' -> 'leside'); use 'ing$' to strip only
#    a trailing suffix.
stemmer = RegexpStemmer('ing')
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))
print(stemmer.stem('cooked'))
print(stemmer.stem('ingleside'))
"""
cook
cookery
cooked
leside
"""

# Section separator: label, a 100-dash rule, and a trailing newline,
# passed to print() as three separate arguments.
print("[Regexp_Stem]", "-"*100, "\n")
 
 
# -- The SnowballStemmer class
#    Original author's note: supports stemming for 13 languages besides
#    English. The language is selected by name at construction time.

spanish_stemmer = SnowballStemmer('spanish')
print(spanish_stemmer.stem('hola'))
"""
hol
"""

# Section separator: label, a 100-dash rule, and a trailing newline,
# passed to print() as three separate arguments.
print("[Snowball_Stem]", "-"*100, "\n")
 
 



-- Lemmatization

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
 
# ------------------------------
# -- Lemmatizing
# ------------------------------
#    Similar to stemming, but closer to synonym replacement: the result
#    is a real word rather than a truncated stem.
#    "Lemma" means root word.

# -- Lemmatize
#    NOTE: WordNetLemmatizer needs the WordNet corpus to be installed
#    (e.g. via nltk.download('wordnet')).
#    Without a pos argument the word is treated as a noun, so 'cooking'
#    and 'cooked' come back unchanged; pos='v' treats them as verbs.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cooking'))
print(lemmatizer.lemmatize('cooking', pos='v'))
print(lemmatizer.lemmatize('cooked'))
print(lemmatizer.lemmatize('cooked', pos='v'))
print(lemmatizer.lemmatize('cookbooks'))
"""
cooking
cook
cooked
cook
cookbook
"""

# Section separator: label, a 100-dash rule, and a trailing newline,
# passed to print() as three separate arguments.
print("[Lemmatize_Stem]", "-"*100, "\n")
 
# -- Stemming vs. lemmatization
#    Runs the same words through a Porter stemmer and a WordNet
#    lemmatizer to contrast suffix stripping with root-word lookup.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

print(stemmer.stem('believes'))
print(lemmatizer.lemmatize('believes'))
"""
believ
belief
"""

# The part of speech must be its own argument: writing 'cooking''n'
# concatenates the two literals into the single word 'cookingn'.
print(stemmer.stem('cooking'))
print(lemmatizer.lemmatize('cooking'))
print(lemmatizer.lemmatize('cooking', pos='n'))
print(lemmatizer.lemmatize('cooking', pos='v'))
print(lemmatizer.lemmatize('cooking', pos='a'))

"""
cook
cooking
cooking
cook
cooking
"""

# Section separator: label, a 100-dash rule, and a trailing newline,
# passed to print() as three separate arguments.
print("[Stem vs. Lemmatize]", "-"*100, "\n")
 
 
 
 


반응형

+ Recent posts