-
Notifications
You must be signed in to change notification settings - Fork 0
/
regex.py
117 lines (78 loc) · 3.18 KB
/
regex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from io import StringIO
import re
import hashlib
import streamlit as st
# Initial (empty) script state: the output text buffer and a set for
# tracking lines that have already been seen.
out = ""
completed_lines_hash = set()

# Sample of raw YouTube WebVTT captions, displayed on the page as the
# "before" example. Built line by line; no trailing newline.
before = "\n".join((
    "WEBVTT",
    "Kind: captions",
    "Language: es",
    "00:00:10.480 --> 00:00:13.680 align:start position:0%",
    "muchas <00:00:10.964><c>gracias </c><00:00:11.448><c>y </c><00:00:11.932><c>bueno </c><00:00:12.416><c>antes </c><00:00:12.900><c>de </c><00:00:13.384><c>todo</c>",
    "00:00:13.680 --> 00:00:13.690 align:start position:0%",
    "muchas gracias y bueno antes de todo",
    "00:00:13.690 --> 00:00:16.140 align:start position:0%",
    "muchas gracias y bueno antes de todo",
    "pues <00:00:14.079><c>agradecerte </c><00:00:14.468><c>tanto </c><00:00:14.857><c>a </c><00:00:15.246><c>ti </c><00:00:15.635><c>como </c><00:00:16.024><c>a</c>",
    "00:00:16.140 --> 00:00:16.150 align:start position:0%",
    "pues agradecerte tanto a ti como a",
))

# The same captions after cleaning, displayed as the "after" example.
after = "\n".join((
    "WEBVTT",
    "Kind: captions",
    "Language: es",
    "muchas gracias y bueno antes de todo",
    "pues agradecerte tanto a ti como a",
    "silver en la oportunidad porque en",
))
# Web App Title
# Page header: the title, a short description (in Spanish) of what the
# cleaner does, and the before/after sample rendered as plain code blocks.
st.markdown('''
# **The Regex wizard!**
Esta aplicación permite importar archivos de texto extraídos de subtítulos de videos de YouTube y realizar una limpieza para obtener finalmente un txt limpio y legible.
Se eliminan frases repetidas, saltos de línea, timestamps y caracteres especiales de separación.
Texto de ejemplo extraído directamente de los subtítulos de YouTube:
''')
st.code(before, language=None)
st.markdown(''' Después de la limpieza:
''')
st.code(after, language=None)
# The blank line before `---` is required: a line of text directly followed
# by `---` is a Markdown setext heading, which would turn the credit line
# into a heading instead of drawing a horizontal rule beneath it.
st.markdown('''
**Credit:** App built in `Python` + `Streamlit` by [Roberto](https://github.com/rchatru)

---
''')
# Caption markup removed from every line, in order of precedence:
#   1. inline tags such as <c>, </c> and <00:00:10.964>
#   2. everything from the first "0" up to and including the next "%"
#      (this is what strips "00:00:10.480 --> ... position:0%" cue lines)
#   3. any remaining digits:digits fragment
# NOTE(review): alternative 2 also removes caption text that happens to sit
# between a literal "0" and a "%" on the same line - confirm this is acceptable.
_CAPTION_NOISE = re.compile(r'(<[^>]+>)|(0[^%]+%)|([0-9]+:[0-9]+)')


def _clean_captions(raw_text):
    """Return *raw_text* with caption markup removed and repeated lines dropped.

    The text is processed one line at a time so that a pattern can never
    match across a line boundary. For each line the caption markup and any
    literal backslash-n sequences are removed and surrounding whitespace is
    stripped. Lines that end up empty, or that are identical to a line
    already kept, are skipped.

    Args:
        raw_text: The caption text to clean (any newline convention).

    Returns:
        The kept lines in their original order, each terminated by a single
        newline. An empty string when nothing is kept.
    """
    seen = set()
    kept = []
    for raw_line in raw_text.splitlines():
        line = _CAPTION_NOISE.sub("", raw_line)
        # Also drop literal two-character "\n" sequences; real line breaks
        # are already handled by splitlines().
        line = line.replace("\\n", "").strip()
        if not line or line in seen:
            continue
        seen.add(line)
        kept.append(line)
    return "".join(line + "\n" for line in kept)


def _show_result(cleaned):
    """Write *cleaned* to the page and offer it as a download in the sidebar."""
    st.write(cleaned)
    with st.sidebar:
        st.header('2. Download processed txt file')
        st.download_button('Download file', cleaned)


# Upload txt data
st.sidebar.header('1. Upload your txt file')
uploaded_file = st.sidebar.file_uploader("Upload your input txt file", type=["txt"])

if uploaded_file is not None:
    # "utf-8-sig" decodes plain UTF-8 as well and additionally drops a
    # leading byte-order mark, which would otherwise stick to the first line.
    out = _clean_captions(uploaded_file.read().decode('utf-8-sig'))
    _show_result(out)
else:
    # No file uploaded: fall back to a free-text area. The pasted text is
    # only processed once the user presses the Submit button.
    text = st.text_area("Enter text for processing")
    if st.button('Submit'):
        st.success('Correcto')
        out = _clean_captions(text)
        _show_result(out)