# textproc/libtextcat, r462572 -> r462573
#
# Makefile:
#   * The PORTDOCS installation (MKDIR of DOCSDIR + INSTALL_DATA) moves out of
#     post-install into a separate post-install-DOCS-on target, and the MKDIR
#     line gains an @ prefix.
#   * The regression-test target is renamed to do-test; its recipe is unchanged.
# pkg-descr:
#   * The WWW: line switches from http:// to https://.
#
Index: head/textproc/libtextcat/Makefile
===================================================================
--- head/textproc/libtextcat/Makefile	(revision 462572)
+++ head/textproc/libtextcat/Makefile	(revision 462573)
@@ -1,41 +1,43 @@
 # Created by: thierry@pompo.net
 # $FreeBSD$
 
 PORTNAME=	libtextcat
 PORTVERSION=	2.2
 PORTREVISION=	6
 CATEGORIES=	textproc
 MASTER_SITES=	http://software.wise-guys.nl/download/
 
 MAINTAINER=	thierry@FreeBSD.org
 COMMENT=	Language guessing by N-Gram-Based Text Categorization
 
 LICENSE=	BSD3CLAUSE
 LICENSE_FILE=	${WRKSRC}/LICENSE
 
 GNU_CONFIGURE=	yes
 USES=		libtool
 USE_LDCONFIG=	yes
 
 OPTIONS_DEFINE=	DOCS
 
 PORTDOCS=	README TODO
 
 post-install:
 	${INSTALL_DATA} ${WRKSRC}/src/textcat.h ${STAGEDIR}${PREFIX}/include/
 	${MKDIR} ${STAGEDIR}${DATADIR}/LM
 	@${ECHO_MSG} "Installing language models provided in Gertjan van Noord's TextCat package"
 	(cd ${WRKSRC}/langclass/LM &&	\
 	${FIND} . -name "*.lm" -exec ${INSTALL_DATA} {} "${STAGEDIR}${DATADIR}/LM/{}" \;)
 	${INSTALL_DATA} ${WRKSRC}/langclass/conf.txt "${STAGEDIR}${DATADIR}"
-	${MKDIR} ${STAGEDIR}${DOCSDIR}
+
+post-install-DOCS-on:
+	@${MKDIR} ${STAGEDIR}${DOCSDIR}
 	${INSTALL_DATA} ${PORTDOCS:S|^|${WRKSRC}/|} ${STAGEDIR}${DOCSDIR}
 
-regression-test:
+do-test:
 	(cd ${WRKSRC}/langclass/ &&	\
 	for t in `${LS} ShortTexts/*.txt` ; do	\
 		${ECHO_MSG} "Analyzing $$t..." ;	\
 		../src/testtextcat conf.txt < $$t ;	\
 	done)
 
 .include <bsd.port.mk>
Index: head/textproc/libtextcat/pkg-descr
===================================================================
--- head/textproc/libtextcat/pkg-descr	(revision 462572)
+++ head/textproc/libtextcat/pkg-descr	(revision 462573)
@@ -1,17 +1,17 @@
 Libtextcat is a library with functions that implement the classification
 technique described in Cavnar & Trenkle, "N-Gram-Based Text Categorization" [1].
 It was primarily developed for language guessing, a task on which it is known
 to perform with near-perfect accuracy.
 
 The central idea of the Cavnar & Trenkle technique is to calculate a
 "fingerprint" of a document with an unknown category, and compare this with the
 fingerprints of a number of documents of which the categories are known. The
 categories of the closest matches are output as the classification. A
 fingerprint is a list of the most frequent n-grams occurring in a document,
 ordered by frequency. Fingerprints are compared with a simple out-of-place
 metric.
 
 [1] The document that started it all: William B. Cavnar & John M. Trenkle (1994)
 N-Gram-Based Text Categorization.
 
-WWW: http://software.wise-guys.nl/libtextcat/
+WWW: https://software.wise-guys.nl/libtextcat/