반응형
/*******************************************************************************************************************
-- Title : [Py3.5] Word2Vec Example /w Gensim
-- Reference : medium.com
-- Key word : nlp word2vec gensim moby-dick 자연어 처리 자연어처리 모비딕 mobydick moby-dick
*******************************************************************************************************************/
■ Script
# -*- coding: utf-8 -*-
"""Train a Word2Vec model on a plain-text book and list words similar to "whale".

Pipeline: read the text file, split it into sentence fragments, drop blank
fragments, store them in a DataFrame, tokenize each fragment into words, train
Word2Vec on the token lists, and print the nearest neighbours of "whale".
"""
import re

import numpy as np
import pandas as pd

# Sentence boundaries: newline, full stop, or question mark.
SENTENCE_BREAK = re.compile(r"[\n.?]")

# Characters deleted from a sentence before whitespace tokenization.
STRIP_PUNCT = str.maketrans("", "", ',;"')

# Visual separator printed after each intermediate dump.
RULE = "_" * 100


def read_text(path, encoding=None):
    """Return the full contents of the text file at *path*.

    *encoding* is passed to ``open``; ``None`` keeps the platform default.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(path, "r", encoding=encoding) as handle:
        return handle.read()


def split_sentences(text):
    """Split *text* on newlines, full stops, and question marks.

    Fragments are returned untrimmed; empty strings appear wherever two
    delimiters are adjacent.
    """
    return SENTENCE_BREAK.split(text)


def drop_blank_sentences(sentences):
    """Return *sentences* without empty or whitespace-only entries.

    A single filtering pass removes every blank entry and never raises,
    regardless of how many ``''`` and ``' '`` entries are present.
    """
    return [sentence for sentence in sentences if sentence.strip()]


def tokenize(sentence):
    """Remove commas, semicolons, and double quotes, then split on whitespace."""
    return sentence.translate(STRIP_PUNCT).split()


def sentences_to_frame(sentences):
    """Build a DataFrame with one row per sentence in a ``sentences`` column."""
    frame = pd.DataFrame()
    frame["sentences"] = np.asarray(sentences)
    return frame


def train_word2vec(token_lists):
    """Train and return a Word2Vec model on an iterable of token lists."""
    # Imported here so the text helpers above stay importable (and testable)
    # on machines where gensim is not installed.
    from gensim.models import Word2Vec

    # NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; this
    # script targets the older API -- confirm against the installed version.
    return Word2Vec(token_lists, hs=1, size=300, min_count=5)


def main(path="moby1.txt"):
    """Run the whole pipeline on *path*, printing every intermediate stage."""
    # ------------------------------
    # -- Read the file
    # ------------------------------
    moby_dick = read_text(path)
    print(moby_dick)
    print("<raw_doc", RULE)

    # ------------------------------
    # -- Split into sentences
    # ------------------------------
    moby_dick = split_sentences(moby_dick)
    print(moby_dick)
    print("<split_doc", RULE)

    # ------------------------------
    # -- Remove blank / empty entries
    # ------------------------------
    moby_dick = drop_blank_sentences(moby_dick)
    print(moby_dick)
    print("<remove_blank_doc", RULE)

    # ------------------------------
    # -- Store in a DataFrame
    # ------------------------------
    df_mobydick = sentences_to_frame(moby_dick)
    print(df_mobydick)
    print("<df_doc", RULE)

    # ------------------------------
    # -- Split each sentence into words
    # ------------------------------
    df_mobydick["separates"] = df_mobydick["sentences"].apply(tokenize)
    print(df_mobydick)
    print("<df_sep_doc", RULE)

    # ------------------------------
    # -- Train Word2Vec on the tokenized sentences
    # ------------------------------
    model = train_word2vec(df_mobydick["separates"])
    print(model)

    # Similarity queries live on the model's keyed vectors (`model.wv`).
    for word, _score in model.wv.most_similar("whale"):
        print(word)


if __name__ == "__main__":
    main()
■ Files
반응형