Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import NextFile and OutputSplitter from WikiExtractor #234

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 2 additions & 66 deletions wikiextractor/cirrus-extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,79 +42,15 @@
import gzip
import logging

from .WikiExtractor import NextFile, OutputSplitter

# Program version
version = '3.0'

# Base URL of the wiki. NOTE(review): hard-coded to the Italian Wikipedia;
# presumably used when building document URLs -- confirm against the callers.
urlbase = 'http://it.wikipedia.org/'

# ----------------------------------------------------------------------

class NextFile(object):
    """
    Synchronous generation of next available file name.

    Names have the form ``<path_name>/<XX>/wiki_<NN>``: ``XX`` is a
    two-letter directory name (AA, AB, ..., AZ, BA, ...) and ``NN`` is a
    zero-padded file index within that directory. Both letters are taken
    modulo 26, so directory names repeat after 26 * 26 directories.
    """

    # Number of files placed in a directory before moving to the next one.
    filesPerDir = 100

    def __init__(self, path_name):
        """
        :param path_name: root directory under which the lettered
            sub-directories are created.
        """
        self.path_name = path_name
        # Both counters start at -1 so the first next() yields AA/wiki_00.
        self.dir_index = -1
        self.file_index = -1

    def next(self):
        """
        Advance to the next file slot, creating its directory if missing.

        :return: the path of the next file to use.
        :raises OSError: if the directory cannot be created.
        """
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            # Wrapped around: the current directory is full.
            self.dir_index += 1
        dirname = self._dirname()
        if not os.path.isdir(dirname):
            try:
                os.makedirs(dirname)
            except OSError:
                # The directory may have been created by someone else
                # between the check and the call; only a real failure
                # (directory still absent) is propagated.
                if not os.path.isdir(dirname):
                    raise
        return self._filepath()

    def _dirname(self):
        """Return the directory path for the current ``dir_index``."""
        char1 = self.dir_index % 26
        char2 = int(self.dir_index / 26) % 26
        letters = '%c%c' % (ord('A') + char2, ord('A') + char1)
        return os.path.join(self.path_name, letters)

    def _filepath(self):
        """Return the file path for the current directory and file index."""
        return '%s/wiki_%02d' % (self._dirname(), self.file_index)

class OutputSplitter(object):
    """
    Writable file-like wrapper that spreads its output over a sequence of
    files, moving to a new file whenever a write would exceed the size limit.
    """

    def __init__(self, nextFile, max_file_size=0, compress=True):
        """
        :param nextFile: object whose next() method returns the name of
            the next file to create (e.g. a NextFile).
        :param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip compression.
        """
        self.max_file_size = max_file_size
        self.compress = compress
        self.nextFile = nextFile
        first_name = self.nextFile.next()
        self.file = self.open(first_name)

    def open(self, filename):
        """
        Open `filename` for writing; when compression is enabled the file
        gets a '.bz2' suffix and is written through bz2.
        """
        if not self.compress:
            return open(filename, 'w')
        return bz2.BZ2File(filename + '.bz2', 'w')

    def close(self):
        """Close the file currently being written."""
        self.file.close()

    def reserve(self, size):
        """
        Make room for `size` more: if the current position plus `size`
        would exceed max_file_size, close the file and open the next one.
        """
        still_fits = self.file.tell() + size <= self.max_file_size
        if still_fits:
            return
        self.close()
        next_name = self.nextFile.next()
        self.file = self.open(next_name)

    def write(self, data):
        """Write `data`, rolling over to a new file first if required."""
        needed = len(data)
        self.reserve(needed)
        self.file.write(data)

# ----------------------------------------------------------------------

class Extractor(object):

def extract(self, out):
Expand Down