Index: textproc/Makefile
===================================================================
--- textproc/Makefile
+++ textproc/Makefile
@@ -1302,6 +1302,7 @@
     SUBDIR += py-nltk
     SUBDIR += py-normality
     SUBDIR += py-numpydoc
+    SUBDIR += py-ocrmypdf
     SUBDIR += py-openpyxl
     SUBDIR += py-openpyxl24
     SUBDIR += py-openstackdocstheme
Index: textproc/py-ocrmypdf/Makefile
===================================================================
--- /dev/null
+++ textproc/py-ocrmypdf/Makefile
@@ -0,0 +1,71 @@
+# $FreeBSD$
+
+PORTNAME=	ocrmypdf
+DISTVERSION=	8.3.1
+CATEGORIES=	textproc python
+MASTER_SITES=	CHEESESHOP
+PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}
+
+MAINTAINER=	kai@FreeBSD.org
+COMMENT=	Adds an OCR text layer to scanned PDF files
+
+LICENSE=	GPLv3+
+LICENSE_FILE=	${WRKSRC}/LICENSE
+
+BUILD_DEPENDS=	${PYTHON_PKGNAMEPREFIX}cffi>=1.9.1:devel/py-cffi@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pytest-runner>=0:devel/py-pytest-runner@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}setuptools_scm>=0:devel/py-setuptools_scm@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}setuptools_scm_git_archive>=0:devel/py-setuptools_scm_git_archive@${PY_FLAVOR}
+LIB_DEPENDS=	liblept.so:graphics/leptonica
+RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}cffi>=1.9.1:devel/py-cffi@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}img2pdf>=0.3.0,<0.4:graphics/py-img2pdf@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pillow>=4.0.0:graphics/py-pillow@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}reportlab>=3.3.0:print/py-reportlab@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}ruffus>=2.7.0:science/py-ruffus@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}chardet>=3.0.4,<4:textproc/py-chardet@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pdfminer.six>=20181108:textproc/py-pdfminer.six@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pikepdf>=1.3.0,<2:textproc/py-pikepdf@${PY_FLAVOR} \
+		pngquant:graphics/pngquant \
+		tesseract:graphics/tesseract
+TEST_DEPENDS=	${PYTHON_PKGNAMEPREFIX}pytest>=4.4.1,<5:devel/py-pytest@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pytest-helpers-namespace>=2019.1.8:devel/py-pytest-helpers-namespace@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pytest-xdist>=1.28.0:devel/py-pytest-xdist@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pytest-cov>=2.6.1:devel/py-pytest-cov@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}pdf2>=1.26.0:print/py-pdf2@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}python-xmp-toolkit>=0:textproc/py-python-xmp-toolkit@${PY_FLAVOR}
+
+USES=		ghostscript:run python:3.6+ shebangfix
+USE_PYTHON=	autoplist concurrent distutils
+
+SHEBANG_FILES=	src/ocrmypdf/__main__.py \
+		src/ocrmypdf/hocrtransform.py \
+		src/ocrmypdf/leptonica.py \
+		src/ocrmypdf/pdfinfo/__init__.py \
+		tests/spoof/gs_feature_elision.py \
+		tests/spoof/gs_pdfa_failure.py \
+		tests/spoof/tesseract_badutf8.py \
+		tests/spoof/tesseract_big_image_error.py \
+		tests/spoof/tesseract_cache.py \
+		tests/spoof/tesseract_noop.py \
+		tests/spoof/unpaper_oldversion.py \
+		tests/spoof/gs_render_failure.py \
+		tests/spoof/gs_raster_failure.py \
+		tests/spoof/tesseract_crash.py
+
+MAKE_ENV=	LC_ALL=en_US.UTF-8
+
+NO_ARCH=	yes
+
+# This workaround is required as the files aren't added to .PLIST.pymodtmp
+# during the installation process.
+post-stage:
+	${ECHO} "${PYTHONPREFIX_SITELIBDIR}/ocrmypdf/lib/__pycache__/_leptonica.cpython-${PYTHON_SUFFIX}.opt-1.pyc" >> ${WRKDIR}/.PLIST.pymodtmp
+	${ECHO} "${PYTHONPREFIX_SITELIBDIR}/ocrmypdf/lib/__pycache__/_leptonica.cpython-${PYTHON_SUFFIX}.pyc" >> ${WRKDIR}/.PLIST.pymodtmp
+	${ECHO} "${PYTHONPREFIX_SITELIBDIR}/ocrmypdf/lib/_leptonica.py" >> ${WRKDIR}/.PLIST.pymodtmp
+
+do-test:
+	@cd ${WRKSRC} && ${SETENV} LC_ALL=en_US.UTF-8 ${PYTHON_CMD} -m pytest -n ${MAKE_JOBS_NUMBER} -v -k ' \
+		not test_dev_null and \
+		not test_mono_not_inverted'
+
+.include <bsd.port.mk>
Index: textproc/py-ocrmypdf/distinfo
===================================================================
--- /dev/null
+++ textproc/py-ocrmypdf/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1562856619
+SHA256 (ocrmypdf-8.3.1.tar.gz) = e9f87e777c2a4ea924e74d3db02792ca5f8c06ad73f5235fad3c49626e40f14e
+SIZE (ocrmypdf-8.3.1.tar.gz) = 7560708
Index: textproc/py-ocrmypdf/files/patch-setup.cfg
===================================================================
--- /dev/null
+++ textproc/py-ocrmypdf/files/patch-setup.cfg
@@ -0,0 +1,10 @@
+--- setup.cfg.orig	2019-07-04 07:16:53 UTC
++++ setup.cfg
+@@ -9,6 +9,7 @@ ignore =
+     .github
+ 
+ [tool:pytest]
++markers = slow
+ norecursedirs = lib .pc .git output cache resources
+ testpaths = tests
+ filterwarnings =
Index: textproc/py-ocrmypdf/pkg-descr
===================================================================
--- /dev/null
+++ textproc/py-ocrmypdf/pkg-descr
@@ -0,0 +1,19 @@
+OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be
+searched or copy-pasted.
+
+Main features:
+
+* Generates a searchable PDF/A file from a regular PDF
+* Places OCR text accurately below the image to ease copy / paste
+* Keeps the exact resolution of the original embedded images
+* When possible, inserts OCR information as a "lossless" operation without
+  disrupting any other content
+* Optimizes PDF images, often producing files smaller than the input file
+* If requested, deskews and/or cleans the image before performing OCR
+* Validates input and output files
+* Distributes work across all available CPU cores
+* Uses Tesseract OCR engine to recognize more than 100 languages
+* Scales properly to handle files with thousands of pages
+* Battle-tested on millions of PDFs
+
+WWW: https://github.com/jbarlow83/OCRmyPDF