반응형
/*******************************************************************************************************************
-- Title : [Py3.5] Word Frequency & TF-IDF Example w/ TextBlob
-- Reference : stevenloria.com/finding-important-words-in-a-document-using-tf-idf/
-- Key word : tf-idf tfidf tf idf textblob text blob n-containing enumerate word frequency word embedding
word embedding (워드 임베딩 / 단어 임베딩), frequency counting (빈도 계산)
*******************************************************************************************************************/
■ TF-IDF
import math
from textblob import TextBlob

# ------------------------------
# -- BLOB Raw Text
# ------------------------------
# Three tiny patent-title "documents"; the corpus is a list of TextBlob objects.
sent_blob0 = TextBlob("""Pneumatic Radial Tire for Use on Passenger Car.""")
sent_blob1 = TextBlob("""Car tire attachment mechanism.""")
sent_blob2 = TextBlob(""" Tire actuated generator for use on cars .""")

corpus = [sent_blob0, sent_blob1, sent_blob2]

for i, sent in enumerate(corpus):
    print("[", i, "].raw:", sent)
print("... raw_blob_text", "." * 100, "\n")

# ------------------------------
# -- BLOB Token-POS-Chunk
# ------------------------------
# Ref: dbrang.tistory.com/1199
#
# -- word tokenizing and pos tagging
# for i, sent in enumerate(corpus):
#     sent = TextBlob(str(corpus[i]))
#     print("[", i, "].tag:", sent.tags)
#
# -- Noun Phrase(NP) Chunking
# for i, sent in enumerate(corpus):
#     print("[1].NP", corpus[i].noun_phrases)
#
# print(",,, np chunking", "," * 100, "\n")

# ------------------------------
# -- TF-IDF Functions
# ------------------------------
def tf(word, sent):
    """Term frequency: fraction of tokens in `sent` that equal `word`."""
    return sent.words.count(word) / len(sent.words)


def n_containing(word, corp):
    """Document frequency: number of documents in `corp` containing `word`."""
    return sum(1 for sent in corp if word in sent.words)


def idf(word, corp):
    """Inverse document frequency: log(N / (1 + document frequency)).

    The +1 in the denominator avoids division by zero for unseen words.
    NOTE(review): with this formula a word present in every document gets a
    negative idf (e.g. log(3/4)); kept as-is to match the cited tutorial.
    """
    return math.log(len(corp) / (1 + n_containing(word, corp)))


def tfidf(word, sent, corp):
    """TF-IDF score of `word` for document `sent` within corpus `corp`."""
    return tf(word, sent) * idf(word, corp)


# ------------------------------
# -- TF-IDF Calculation
# ------------------------------
# For each document, score every token and print the top 3 by TF-IDF.
for i, blob in enumerate(corpus):
    # `blob` is corpus[i]; no need to re-index the list.
    print("Top words in sent_blob [{}]".format(i), "- \"", blob, "\"")
    scores = {word: tfidf(word, blob, corpus) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:  # top(3)
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
반응형