# Source code for lena.output.pdf_to_png
"""PDF to PNG converter."""
from __future__ import print_function
import sys
import subprocess
import os
import lena.context
import lena.flow
def _run_command(command, verbose=True, timeoutsec=60):
    """Run a system command via the *subprocess* module.

    *command* is a list of strings (the program and its arguments).
    It is executed directly, without a shell.

    If *verbose* is ``True``, the command line is printed
    before the command is run.

    *timeoutsec* is the timeout in seconds. It is used only
    in Python 3 (*communicate* accepts no timeout in Python 2).
    If the timeout expires, the child process is killed and
    waited for, and :exc:`subprocess.TimeoutExpired` is re-raised.

    Nothing is returned. If the command exits with a non-zero
    return code, the collected output data and the return code
    are printed.
    """
    command_name = " ".join(command)
    if verbose:
        print(command_name)
    # todo: if verbose is False, prohibit output
    # stdout and stderr are not piped: the child writes to the parent's
    # streams, and communicate() returns None for both of them.
    popen = subprocess.Popen(command)
    if sys.version_info.major > 2:
        try:
            # timeout must be passed by keyword: the first positional
            # argument of communicate() is *input*, not the timeout.
            stdoutdata, stderrdata = popen.communicate(timeout=timeoutsec)
        except subprocess.TimeoutExpired:
            # communicate() does not stop the child on timeout;
            # kill it and reap it so that no process is left behind.
            popen.kill()
            popen.communicate()
            raise
    else:
        stdoutdata, stderrdata = popen.communicate()
    returncode = popen.returncode
    if returncode:
        # todo: think about a warning here
        print("stdoutdata: ", stdoutdata)
        print("stderrdata: ", stderrdata)
        print("returncode: ", returncode)
class PDFToPNG(object):
    """Convert PDF to image format (by default PNG)."""

    def __init__(self, format="png", overwrite=False, verbose=True,
                 timeoutsec=60):
        """Set output *format* (by default *png*).

        If the resulting file already exists and the *pdf* is unchanged
        (which is checked through *context.output.changed*), conversion
        is not repeated.
        To convert all pdfs to images, set *overwrite* to ``True``
        (by default it is ``False``).

        To disable printing messages during :meth:`run`,
        set *verbose* to ``False``.

        *timeoutsec* is time (in seconds) for *subprocess* timeout
        (used only in Python 3). If the timeout expires,
        the child process will be killed and waited for.
        The :exc:`TimeoutExpired` exception will be re-raised
        after the child process has terminated.

        This element uses ``pdftoppm`` binary internally.
        ``pdftoppm`` can use other output formats,
        for example *jpeg* or *tiff*.
        See ``pdftoppm`` manual for more details.
        """
        self._format = format
        self._timeoutsec = timeoutsec
        self._overwrite = overwrite
        self._verbose = verbose

    @staticmethod
    def _strip_pdf_suffix(pdf_name):
        """Return *pdf_name* without a trailing ``.pdf`` extension.

        Only the extension at the very end is removed; other
        occurrences of ``.pdf`` in the path (for example,
        in a directory name) are kept.
        A name that does not end with ``.pdf`` is returned unchanged.
        """
        suffix = ".pdf"
        if pdf_name.endswith(suffix):
            return pdf_name[:-len(suffix)]
        return pdf_name

    def run(self, flow):
        """Convert PDF files to *format*.

        PDF files are recognized via *context.output.filetype*.
        Their paths are assumed to be the data part of the value.

        Data yielded is the resulting file name.
        Context is updated with *output.filetype* set to *format*,
        and *output.changed* is set to ``True`` if the conversion
        command was run and to ``False`` if it was skipped.

        Other values are passed unchanged.
        """
        def is_pdf(context):
            """May be passed as a parameter to the class."""
            filetype = lena.context.get_recursively(context,
                                                    "output.filetype", "")
            return filetype == "pdf"

        for val in flow:
            data, context = lena.flow.get_data_context(val)
            if not is_pdf(context):
                yield val
                continue

            outputc = context["output"]
            # the new file type is the requested format, not always png
            outputc["filetype"] = self._format
            pdf_name = data
            basename = self._strip_pdf_suffix(pdf_name)
            # NOTE(review): pdftoppm may use a different extension than
            # the format name (e.g. .jpg for -jpeg, .tif for -tiff);
            # verify against the pdftoppm manual.
            image_name = basename + "." + self._format

            if (not os.path.exists(image_name) or self._overwrite
                    or outputc.get("changed", False)):
                # -singlefile makes pdftoppm write only the first page
                # and not append a page number to the output name.
                command = ["pdftoppm", pdf_name, basename,
                           "-" + self._format, "-singlefile"]
                _run_command(command, verbose=self._verbose,
                             timeoutsec=self._timeoutsec)
                outputc["changed"] = True
            else:
                if self._verbose:
                    print("# file unchanged, PDFToPNG skips {}"
                          .format(pdf_name))
                outputc["changed"] = False
            yield (image_name, context)