반응형
/*******************************************************************************************************************
-- Title : [Py3.5] Sent_tokenization, Noun-Phrasing and NER w/ spaCy
-- Reference : blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/
-- Key word : nlp spacy pandas set_option txt open text read sent_token word_token tokenizer sentence
noun phrase chunk chunker chunking tokenizing named entity recognition ner 자연어처리
텍스트 단어 토큰 문장 토큰 토크나이저 청킹 청크 청커
*******************************************************************************************************************/
■ Scripts
"""Sentence tokenization, noun-phrase chunking and NER over a text file with spaCy.

The script reads ``review.txt``, runs it through a spaCy pipeline, and prints
three views of the parsed document, each as a raw list and/or a pandas
DataFrame:

* the sentences found in the document,
* the noun chunks, with their root token and the root's head token text,
* the named entities labelled ``ORG`` or ``PERSON``.

Ref : http://blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/
"""
import spacy
import pandas as pd

########################################
# -- Set Dataframe Option
########################################
# Show full cell contents instead of truncating long strings.
# ``None`` is the "no limit" value on current pandas; the negative sentinel is
# deprecated/rejected there. Older releases that validate this option as a
# plain int raise ValueError for ``None``, so fall back to the legacy ``-1``.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

########################################
# -- Sourcing and Declaration
########################################
# --
# -- Read Source from file
# --
# The context manager guarantees the handle is closed even if read() fails.
with open("review.txt", "r") as fl_source:
    source = fl_source.read()

# Preview the first 150 characters of the input.
print(source[:150])
print("... source", "." * 100, "\n")

# --
# -- Making document
# --
# NOTE(review): "en" is a shortcut model name; newer spaCy releases presumably
# require a full package name such as "en_core_web_sm" — confirm against the
# installed spaCy version before changing it.
nlp = spacy.load("en")
doc = nlp(source)

########################################
# -- Sentence Tokenizing
########################################
# One string per sentence span produced by the pipeline.
lst_sent = [sent_token.orth_ for sent_token in doc.sents]
df_sent = pd.DataFrame(lst_sent)

print(lst_sent)
print(",,, list_sent_token", "," * 100, "\n")
print(df_sent.head(5))
print(",,, df_sent_token", "," * 100, "\n")

########################################
# -- Noun Phrase
########################################
# Each row: chunk text, the chunk span itself, its root token, and the text of
# the root's syntactic head.
lst_nounphrase = [
    [nchunk.orth_, nchunk, nchunk.root, nchunk.root.head.orth_]
    for nchunk in doc.noun_chunks
]
df_nounphrase = pd.DataFrame(lst_nounphrase)

print(lst_nounphrase)
print(";;; lst_nounphrase", ";" * 100, "\n")
print(df_nounphrase.head(5))
print(";;; df_nounphrase", ";" * 100, "\n")

########################################
# -- Named Entity Recognition(NER)
########################################
entities = list(doc.ents)
print("There were {} entities found".format(len(entities)))

# Keep only the entities labelled as organisations or people.
lst_ner_org_people = [
    entity.orth_ for entity in entities if entity.label_ in ['ORG', 'PERSON']
]
df_ner_org_people = pd.DataFrame(lst_ner_org_people)

print(df_ner_org_people)
print("^^^ df_ner", "^" * 100, "\n")
■ Files
반응형