반응형
/*******************************************************************************************************************
-- Title : [Py3.5] Stemming and Lemmatization w/ NLTK
-- Reference : excelsior-cjh.tistory.com
-- Key word : nlp nltk 자연어 처리 자연어처리 품사 스테밍 stemming stemmer 스테머 lemmatization
porter lancaster regexp snowball 정규식 정규 표현식 정규표현식 regular expression
lemmatizing
*******************************************************************************************************************/
-- Stemming
# Demo script: run the same sample words through four NLTK stemmers and
# print the resulting stems. The expected output recorded after each section
# is the output reported by the original author of this script.
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.regexp import RegexpStemmer
from nltk.stem.snowball import SnowballStemmer

# ------------------------------
# -- Stemming
# ------------------------------

# Words shared by the Porter, Lancaster and Regexp sections.
SAMPLE_WORDS = ('cooking', 'cookery', 'cooked')

# -- Porter stemming algorithm
stemmer = PorterStemmer()
for word in SAMPLE_WORDS:
    print(stemmer.stem(word))
# Expected output:
#   cook
#   cookeri
#   cook
print("[Porter_Stem]", "-"*100, "\n")

# -- LancasterStemmer class
# Original author's note: Lancaster performs better than Porter
# (not verified by this script).
stemmer = LancasterStemmer()
for word in SAMPLE_WORDS:
    print(stemmer.stem(word))
# Expected output:
#   cook
#   cookery
#   cook
print("[Lancaster_Stem]", "-"*100, "\n")

# -- RegexpStemmer class
# Stems by stripping whatever matches a regular expression. Suited to special
# cases that the Porter or Lancaster stemmers do not handle.
stemmer = RegexpStemmer('ing')
for word in SAMPLE_WORDS:
    print(stemmer.stem(word))
# The pattern is not anchored to the end of the word, so a leading 'ing' is
# stripped as well.
print(stemmer.stem('ingleside'))
# Expected output:
#   cook
#   cookery
#   cooked
#   leside
print("[Regexp_Stem]", "-"*100, "\n")

# -- SnowballStemmer class
# Original author's note: supports stemming for 13 languages besides English.
spanish_stemmer = SnowballStemmer('spanish')
print(spanish_stemmer.stem('hola'))
# Expected output:
#   hol
print("[Snowball_Stem]", "-"*100, "\n")
-- Lemmatization
# Demo script: lemmatize sample words with NLTK's WordNetLemmatizer and
# contrast the results with Porter stemming.
# NOTE: WordNetLemmatizer needs the WordNet corpus data to be installed
# (e.g. via nltk.download('wordnet')) before this script can run.
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# ------------------------------
# -- Lemmatizing
# ------------------------------
# Similar to stemming, but closer in spirit to synonym replacement.
# A lemma is a root word.

# -- Lemmatize
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cooking'))
print(lemmatizer.lemmatize('cooking', pos='v'))
print(lemmatizer.lemmatize('cooked'))
print(lemmatizer.lemmatize('cooked', pos='v'))
print(lemmatizer.lemmatize('cookbooks'))
# Expected output (as recorded by the original author):
#   cooking
#   cook
#   cooked
#   cook
#   cookbook
print("[Lemmatize_Stem]", "-"*100, "\n")

# -- Stemming vs. lemmatization
# The lemmatizer created above is reused; only the stemmer is new here.
stemmer = PorterStemmer()
print(stemmer.stem('believes'))
print(lemmatizer.lemmatize('believes'))
# Expected output:
#   believ   <- the Porter stem is not a dictionary word
#   belief
print(stemmer.stem('cooking'))
print(lemmatizer.lemmatize('cooking'))
print(lemmatizer.lemmatize('cooking', 'n'))
print(lemmatizer.lemmatize('cooking', 'v'))
print(lemmatizer.lemmatize('cooking', 'a'))
# Expected output (as recorded by the original author):
#   cook
#   cooking
#   cooking
#   cook
#   cooking
print("[Stem vs. Lemmatize]", "-"*100, "\n")
반응형