/*******************************************************************************************************************
-- Title : [Py3.5] Basic NLP function example w/ spaCy
-- Reference : spacy.io
-- Keyword : spacy nlp word_token tokenizing tokenization sent_token pos_tagging lemmatizing
             lemmatization noun chunking chunk natural language processing token POS tagging lemma
             tokenizer lemmatizer chunker named_entity ner word_vector similarity
*******************************************************************************************************************/
■ Scripts
import spacy
import pandas as pd

# =======================================
# -- Set Dataframe Option
# =======================================
pd.set_option('display.max_colwidth', -1)   # show full column width (newer pandas versions use None instead of -1)

# =======================================
# -- Sourcing
# =======================================
source = """Spacy is the most effective library on NLP.
The quick brown fox jumps over the lazy dog.
He loves his king and his queen.
This system is for DATA STORAGE UPON.
Rami Eid is studying at Stony Brook University in New York."""

# =======================================
# -- Pipeline and Properties
# =======================================
# --
# -- Create nlp object
# --
nlp = spacy.load("en")
print(nlp)
print("... spacy_load(en)", "." * 100, "\n")

# --
# -- Modeling nlp()
# --
nlp_doc = nlp(source)
print(type(nlp_doc))
print("... type(nlp_doc)", "." * 100, "\n")

print(nlp_doc)
print("... print(nlp_doc)", "." * 100, "\n")

# --
# -- Properties
# --
# -- See dir()
print(dir(nlp_doc))
print("... dir(nlp_doc)", "." * 100, "\n")

# -- nlp_doc to list
lst_doc = list(nlp_doc)
print(type(lst_doc))
print("... type(list(nlp_doc))", "." * 100, "\n")

print("lst_doc[:5] >", lst_doc[:5])
print("nlp_doc[:5] >", nlp_doc[:5])
print("... lst_doc[:5] vs nlp_doc[:5]", "." * 100, "\n")

# =======================================
# -- Sentence Tokenization
# =======================================
# --
# -- sent_token
# --
for idx, sent_token in enumerate(nlp_doc.sents):
    print(idx, "--->", sent_token)
print(",,, sent_token with idx", "," * 100, "\n")

# --
# -- To dataframe from sent_token
# --
df_sent = [sent_token.orth_ for sent_token in nlp_doc.sents]
print(pd.DataFrame(df_sent[:10]))
print(",,, to dataframe from sent_token", "," * 100, "\n")

# =======================================
# -- Word Tokenization
# =======================================
for word_token in nlp_doc:
    print(word_token)
print(";;; word_token", ";" * 100, "\n")

for idx, word_token2 in enumerate(nlp_doc):
    print(idx, "---> ", word_token2)
print(";;; word_token with idx", ";" * 100, "\n")

# =======================================
# -- POS Tagging
# =======================================
for word_token in nlp_doc:
    print("word_token:", word_token,
          " pos:", word_token.pos, " pos_:", word_token.pos_,
          " tag:", word_token.tag, " tag_:", word_token.tag_)
print("^^^ pos_tagging", "^" * 100, "\n")

# =======================================
# -- Lemmatization
# =======================================
for word_token in nlp_doc:
    print("word_token:", word_token,
          " lemma:", word_token.lemma, " lemma_:", word_token.lemma_)
print("*** lemmatizing", "*" * 100, "\n")

# =======================================
# -- Named Entity Recognition
# =======================================
for entity in nlp_doc.ents:
    print("entity:", entity, " label:", entity.label, " label_:", entity.label_)
print(">>> named_entity_recognizing", ">" * 100, "\n")

# =======================================
# -- Noun Chunking
# =======================================
for chunk in nlp_doc.noun_chunks:
    print("chunk:", chunk)
print("!!! noun_chunking", "!" * 100, "\n")

# =======================================
# -- Word Vector
# =======================================
nlp_doc2 = nlp(u"""spacy is an open-source software library for advanced Natural Language Processing,
written in the programming languages Python and Cython. It offers the fastest syntactic parser in the world.
The library is published under the MIT license and currently supports English and German,
as well as tokenization for Chinese and several other languages.
Unlike NLTK, which is mainly intended for teaching and research, spaCy focuses on providing software
for production usage. As of version 1.0, spaCy also supports deep learning workflows that allow connecting
statistical models trained by popular machine learning libraries like TensorFlow, Keras or Scikit-learn.
spaCy's machine learning library, Thinc, is also available as a separate open-source Python library.""")

spacy_tok = nlp_doc2[0]   # renamed from "spacy" to avoid shadowing the imported module
library = nlp_doc2[7]
python = nlp_doc2[20]
nltk = nlp_doc2[64]
print(spacy_tok, library, python, nltk)

print(spacy_tok.similarity(library))   # prints 0.0: the "en" shortcut model has no real word vectors (see note below)
print(python.similarity(nltk))
print("??? word_vector", "?" * 100, "\n")