반응형
/*******************************************************************************************************************
-- Title : [Py3.5] 정규 표현식을 활용한 XML 태그 제거
-- Key word : python 파이썬 정규 표현식 정규식 regular expression 정규표현식 re 태그 제거 태그제거
*******************************************************************************************************************/
-- Case 1.
1
2
3
4
|
import re

# Case 1: pull out lowercase "words" that may contain a single inner hyphen.
# The pattern needs at least one letter on each side of the optional hyphen,
# so every match is two or more characters long; digits, uppercase letters and
# other punctuation act as separators.
sent = "hi-hello, how about? wops.co,ltd ttt123 456 -hahaha ya*ho_kkk"

# Raw string: "\-" inside a normal string literal is an invalid escape sequence.
m = re.findall(r'[a-z]+[\-]?[a-z]+', sent)
print(m)
|
cs |
-- Case 2.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
|
# -*- coding:utf-8 -*-
import re
# 프린팅 포맷
def printResult(regex, repl, text):
    """Apply one regex substitution to *text*, print a trace, and return it.

    Args:
        regex: Pattern string passed to ``re.sub``.
        repl: Replacement passed to ``re.sub``.
        text: Input string to transform.

    Returns:
        The string produced by ``re.sub(regex, repl, text)``.

    Side effects:
        Prints the pattern (prefixed with ``# -- ``), the substituted text and
        a 30-character dash divider to standard output.
    """
    value = re.sub(regex, repl, text)
    # Trace layout: header line, transformed text, divider. The '\n' items are
    # separate print() arguments, so each is surrounded by the default ' '
    # separator in the output.
    print('# -- ' + regex, '\n', value, '\n', '-' * 30)
    return value
# Ordered (pattern, replacement) rules applied by stripwopsTag().
# Order matters: specific tags are handled before the generic tag remover, and
# whitespace is normalised only after all markup has been replaced.
# Patterns containing literal control characters are deliberately NOT raw
# strings so the text printed in the trace stays the same.
_STRIP_RULES = (
    # Whole <style ...>...</style> element, including the CSS text. A pattern
    # ending in a lazy "(.*?)" would match only the opening tag and leave the
    # CSS in the output. (?s) lets "." span newlines.
    (r"(?s)<style[^>]*>.*?</style>", ""),
    (r"<(/)?claim[^>]*>", ""),    # <claim ...> / </claim ...>
    (r"<(/)?li[^>]*>", " "),      # <li ...> / </li ...>
    (r"<(/)?LI[^>]*>", " "),      # <LI ...> / </LI ...>
    (r"<(/)?T[^>]*>", " "),       # any tag whose name starts with "T"
    (r"<(/)?t[^>]*>", " "),       # any tag whose name starts with "t"
    (r"<(/)?sub[^>]*>", ""),      # <sub ...> / </sub ...>
    (r"<(/)?pre[^>]*>", " "),     # <pre ...> / </pre ...>
    # Generic tag: optional "/", a name with up to three "-suffix" parts, an
    # optional attribute section, optional self-closing "/".
    (r"<(/)?([a-zA-Z]*)(-[a-zA-Z]*)?(-[a-zA-Z]*)?(-[a-zA-Z]*)?"
     r"(\s[a-zA-Z]*(\s)?(\s)?=[^>]*)?(\s)*(/)?>", " "),
    # Bracketed placeholders, case-insensitive. The (?i) flag must be the very
    # first thing in the pattern: a global inline flag in the middle of a
    # pattern raises re.error on Python 3.11+.
    (r"(?i)\[image\]", ""),
    (r"<(\?)?([a-zA-Z]*)[^>]*\?>", " "),   # <? ... ?> processing instructions
    (r"<(/)?exch:p[^>]*>", " "),           # docdb <exch:p...> tags
    # e.g. [Chemical Formula 9132.3904]
    (r"(?i)\[Chemical(\s)Formula(\s)?([0-9]{1,10})?(\.?)?([0-9]{1,10})?\]", ""),
    # e.g. [Numerical Formula 9132.3904]
    (r"(?i)\[Numerical(\s)Formula(\s)?([0-9]{1,10})?(\.?)?([0-9]{1,10})?\]", ""),
    (r"(?i)\[Table(\s)?([0-9]{1,10})?\]", ""),   # e.g. [Table 031041]
    # Whitespace normalisation.
    ("[\n]+", " "),        # every run of newlines -> one space
    ("[ \t]+", " "),       # runs of spaces/tabs -> one space
    # The next two rules cannot match because the first whitespace rule
    # already removed every newline. They are kept so the printed trace
    # still shows one entry per step.
    ("[\n]+", "\n"),
    ("(\n \\n)+", "\n"),
    ("[ \t]+", " "),       # collapse spaces/tabs again
    ("\r\n", " "),         # also cannot match once newlines are gone
    ("\r", ""),            # drop remaining carriage returns
    (r"<!--(\s)?([a-zA-Z]*)?(\s)?-->", " "),   # short comments like <!-- EPO -->
)


def stripwopsTag(text):
    """Strip markup tags and bracketed placeholders from *text*.

    Runs every rule in ``_STRIP_RULES`` in order through ``printResult``, so
    each step prints its pattern and the intermediate text.

    Args:
        text: Markup string to clean.

    Returns:
        The text after all substitutions have been applied.
    """
    value = text
    for regex, repl in _STRIP_RULES:
        value = printResult(regex, repl, value)
    return value
if __name__ == '__main__':
    # Demo input covering the patterns handled by stripwopsTag: style, claim,
    # li/LI, T/t, sub and pre tags, attribute-carrying tags, and the [image],
    # [Table N], [Chemical Formula N.N] and [Numerical Formula N] placeholders.
    # NOTE(review): the closing triple quote directly follows "chromosomes.",
    # so the sample text ends without a closing double quote; left as-is.
    sample = """<style type="text/css">h1{color:white;background:black;}</style>
<claim><heading id="h0001"><u style="single">INTRODUCTION</u></heading></claim>
<p id="p0003" num="0003">Researchers have been</p>
<li>(1) isolating a pluripotent</li>
<LI>(2) inserting the cell or an</LI>
(<i>i</i>.<i>e</i>., the oocyte's nucleus was previously extracted).
<p id="p0001" num="0001"><T>The invention relates</T><t>in part</t>to the cloning
<sub>[Chemical Formula 3923.2393]</sub>of animals that comprise heterologous DNA molecules.
<pre name="pretags">[image]</pre> Such transgenic animals preferably
[Table 310304012]contain at least about 100 kilobase pairs of [Numerical Formula 21943229310245020486]
skilled artisan as "artificial chromosomes."""
    # Run the tag-removal pipeline; the return value is not used here.
    stripwopsTag(sample)
|
cs |
반응형