/*******************************************************************************************************************
-- Title : [Py3.5] Tokenizing Words and Sentences w/ NLTK
-- Reference : pythonprogramming.net
-- Key word : word token, sentence token, tokenize, sent_tokenize, word_tokenize, stopword, nltk, nlp,
              natural language processing, tokenizing, word_token, sent_token
*******************************************************************************************************************/
-- Python : Case 1.
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# -- sample text
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. " \
               "The sky is pinkish-blue. You shouldn't eat cardboard."

# ------------------------------
# -- split into sentences and words
# ------------------------------
# -- sentence tokenizing
print(sent_tokenize(EXAMPLE_TEXT))
print("-" * 200)    # ------------------------------- #

# -- word tokenizing
print(word_tokenize(EXAMPLE_TEXT))
print("-" * 200)    # ------------------------------- #
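Case 1 imports the stopwords corpus but never uses it. A minimal sketch of filtering English stopwords out of the word tokens could look like the lines below; the names stop_words and filtered_words are illustrative, not part of the original, and the snippet assumes nltk.download('stopwords') and nltk.download('punkt') have already been run.

# -- illustrative stopword filtering (assumes the stopwords and punkt data are downloaded)
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in word_tokenize(EXAMPLE_TEXT) if w.lower() not in stop_words]
print(filtered_words)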
-- Python : Case 2.
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.tokenize.regexp import regexp_tokenize

sent = "Welcome to a NLP tutorial series. \n" \
       "It's good to see the python: 3-5."

print(sent)
print("[raw_sentences]", "-" * 100, "\n")

# -- sent_tokenize
print(sent_tokenize(sent))
print("[sent_tokenize]", "-" * 100, "\n")

# -- word_tokenize
print(word_tokenize(sent))
print("[word_tokenize]", "-" * 100, "\n")

# -- word punctuation tokenizing
tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize(sent))
print("[WordPunct_Tokenizer]", "-" * 100, "\n")

# -- regular expression tokenizing
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize(sent))
print("." * 20)
print(regexp_tokenize(sent, r"[\w-]+"))
print("[RegExp_Tokenizer]", "-" * 100, "\n")

# -- whitespace tokenizing (gaps=True splits on the matched whitespace instead of returning it)
tokenizer = RegexpTokenizer(r"\s+", gaps=True)
print(tokenizer.tokenize(sent))
print("[whitespace_tokenize]", "-" * 100, "\n")
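Roughly, the tokenizers in Case 2 differ most visibly on the contraction and the hyphenated number in the sample sentence: word_tokenize splits "It's" into "It" and "'s", WordPunctTokenizer splits it into "It", "'", "s", while the "[\w']+" regular expression keeps "It's" as a single token. The "[\w-]+" pattern keeps hyphenated forms such as "3-5" together, and the whitespace tokenizer (gaps=True) simply splits on spaces, leaving punctuation attached as in "python:" and "3-5.".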