/*******************************************************************************************************************
-- Title : [Py3.5] Count Vector & TF-IDF w/ Scikit-Learn - ver.dBRang
-- Reference : scikit-learn.org/stable/modules/feature_extraction.html
-- Key word : sklearn scikit-learn scikit learn vector vectorization tf-idf tfidf vectorizing count vector
              countvectorizer word embedding word vector word vectors
              word features word feature fit_transform word vectorizer word vectorizing
*******************************************************************************************************************/
■ Word Vectorization & TF-IDF
from sklearn.feature_extraction.text import CountVectorizer

# =======================================
# -- Data Set
# =======================================
train_set = ("The sky is blue.", "The sun is bright.")
test_set = ("The sun in the sky is bright.",
            "We can see the shining sun, the bright sun.")

# =======================================
# -- Declare CountVectorizer
# =======================================
# Uses the default word n-gram analyzer (WordNGramAnalyzer).
# All text is lowercased by default before tokenizing.
# min_df sets the minimum document frequency; only terms at or above it are kept.
vectorizer = CountVectorizer(min_df=1)  # default analyzer + lowercasing; min_df=1 is the default
# For a custom tokenizer, see: dbrang.tistory.com/1248
print(vectorizer)
print("... CountVectorizer", "." * 100, "\n")

# =======================================
# -- Make Feature Matrix
# =======================================
# -- Tokenize the words and count them (fit learns the vocabulary from train_set)
vectorizer.fit_transform(train_set)

# -- Print the word tokens
# (get_feature_names was renamed get_feature_names_out in scikit-learn >= 1.0)
word_token = vectorizer.get_feature_names()
for index in range(len(word_token)):
    print(index, word_token[index])
print(",,, word_token (vectorizer.get_feature_names)", "," * 100, "\n")

# -- Verify the analyzer (note: the one-character token "a" is dropped)
analyze = vectorizer.build_analyzer()
print(analyze("This is a text document to analyze.") ==
      ['this', 'is', 'text', 'document', 'to', 'analyze'])
print(",,, vectorizer.build_analyzer", "," * 100, "\n")

# =======================================
# -- CountVectorizer
# =======================================
smatrix = vectorizer.transform(test_set)

print("train_set : ", train_set)
print("test_set : ", test_set)
print(smatrix)
print(";;; vector & count (vectorizer.transform)", ";" * 100, "\n")
"""
vectorizer.get_feature_names :
((0, blue), (1, bright), (2, is), (3, sky), (4, sun), (5, the))

train_set : ('The sky is blue.', 'The sun is bright.')
test_set : ('The sun in the sky is bright.', 'We can see the shining sun, the bright sun.')

(0, 1)  1    (1, bright)
(0, 2)  1    (2, is)
(0, 3)  1    (3, sky)
(0, 4)  1    (4, sun)
(0, 5)  2    (5, the)
(1, 1)  1    (1, bright)
(1, 4)  2    (4, sun)
(1, 5)  2    (5, the)
"""

print(smatrix.todense())
print(";;; matrix (smatrix.todense)", ";" * 100, "\n")
"""
[[0 1 1 1 1 2]
 [0 1 0 0 2 2]]
"""

print(smatrix.toarray().transpose())
print(";;; matrix (smatrix.toarray)", ";" * 100, "\n")
"""
[[0 0]
 [1 1]
 [1 0]
 [1 0]
 [1 2]
 [2 2]]
"""

# =======================================
# -- TF-IDF
# =======================================
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
print(transformer)

# 6 documents x 3 terms of raw counts
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]

tfidf = transformer.fit_transform(counts)
print(tfidf)
print(tfidf.toarray())
print("*** tfidf.toarray()", "*" * 100, "\n")
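For reference, with smooth_idf=False the transformer computes idf(t) = ln(n / df(t)) + 1 and then L2-normalizes each row (norm='l2' is the transformer's default). The sketch below reproduces the fit_transform output by hand with NumPy; it is only an illustration of the formula, not part of the original example.

import numpy as np

# Same 6-document x 3-term count matrix as above.
counts = np.array([[3, 0, 1],
                   [2, 0, 0],
                   [3, 0, 0],
                   [4, 0, 0],
                   [3, 2, 0],
                   [3, 0, 2]], dtype=float)

n_docs = counts.shape[0]                  # n = 6 documents
df = (counts > 0).sum(axis=0)             # document frequency per term: [6 1 2]
idf = np.log(n_docs / df) + 1.0           # the smooth_idf=False formula

tfidf = counts * idf                      # tf * idf, term by term
tfidf /= np.linalg.norm(tfidf, axis=1, keepdims=True)  # L2-normalize each row

print(tfidf)  # first row ~ [0.8194  0.      0.5732], matching fit_transform(counts)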
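As a usage note, scikit-learn also provides TfidfVectorizer, which folds the two steps above (CountVectorizer followed by TfidfTransformer) into one object. A minimal sketch on the same train/test sets used above:

from sklearn.feature_extraction.text import TfidfVectorizer

train_set = ("The sky is blue.", "The sun is bright.")
test_set = ("The sun in the sky is bright.",
            "We can see the shining sun, the bright sun.")

# Same defaults as CountVectorizer: lowercasing, word tokens, min_df=1.
tfidf_vec = TfidfVectorizer(min_df=1)
tfidf_vec.fit(train_set)                  # learn vocabulary and idf weights from train_set

tfidf_matrix = tfidf_vec.transform(test_set)
print(tfidf_vec.get_feature_names())      # ['blue', 'bright', 'is', 'sky', 'sun', 'the']
print(tfidf_matrix.toarray())             # one L2-normalized tf-idf row per test document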