#!/bin/bash
# Copyright (C) 2008-2018 Richard Kimberly Heck
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# Updates can be found at https://gitlab.com/rikiheck/text
#
# Converts a PDF to DjVu. The steps are:
#   1. extract the page images from the PDF with $PDFIMGS,
#   2. convert every page image to TIFF with ImageMagick's convert,
#   3. either run scantailor-cli over the TIFFs or copy them unchanged,
#   4. run tiff2djvu over the results and copy its output to $OUTFILE.
#
# The settings read below (FIRSTPG, LASTPG, DROPLAST, OUTFILE, DEBUG,
# PDFIMGS, IMGEXTN, INPUTDPI, VERBOSE, INVERT, USESCANTAILOR, THRESHOLD,
# RUNSCANTAILOR, KEEPFILES, TWOPAGE, OCR) are not assigned anywhere in the
# part of the script that survives; see the NOTE(review) markers.
#
# $DEBUG is used as a command prefix (presumably "echo" when debugging --
# confirm), and $PDFIMGS, $INVERT, $THRESHOLD, $KEEPFILES, $TWOPAGE and $OCR
# are option strings. They are left unquoted on purpose so that an empty
# value disappears and a multi-word value is split into separate arguments.

PROG=$(basename -- "$0")

# Prints the usage message on stdout.
function printUsage {
  # NOTE(review): the original here-document was lost from the copy this
  # script was restored from. Only the facts the remaining code establishes
  # (options first, then exactly one PDF file) are stated here; restore the
  # full option list from upstream.
  cat <<EOF
Usage: $PROG [options] file.pdf
EOF
}

############################################################################
# NOTE(review): the default settings and the option parsing belong here.
# They were lost together with the usage text and have NOT been
# reconstructed. Until they are restored, refuse to run without the two
# values the extraction command cannot work without.
: "${FIRSTPG:?FIRSTPG is not set (the option parsing section is missing)}"
: "${PDFIMGS:?PDFIMGS is not set (the option parsing section is missing)}"

############################################################################
# Prereq check
# NOTE(review): the original list of required programs was lost as well.
# These are the two external programs every run invokes; extend the list
# from upstream.
for prog in convert tiff2djvu; do
  if ! command -v "$prog" >/dev/null 2>/dev/null; then
    echo "This script needs the program $prog. It does not seem to be in your path." >&2
    exit 1
  fi
done
# End prereq check
############################################################################

# Check if the remaining argument is a single argument...
if [[ "$#" -ne 1 ]]; then
  printUsage
  exit 2
fi

PDFFILE="$1"
BASEFILE=$(basename -- "$PDFFILE")

if [[ ! -f "$PDFFILE" ]]; then
  echo "$PDFFILE does not exist!" >&2
  exit 1
fi

# Page range handed to the extraction program.
OPTS=(-f "$FIRSTPG")
if [[ -n "$LASTPG" ]]; then
  OPTS+=(-l "$LASTPG")
elif [[ -n "$DROPLAST" ]]; then
  # Anchor on the "Pages:" field so that no other pdfinfo line can match.
  P=$(pdfinfo "$PDFFILE" | grep '^Pages:' | perl -pe 'm{(\d+)}; $_=$1;')
  if [[ ! "$P" =~ ^[0-9]+$ ]]; then
    echo "Can't determine the number of pages in $PDFFILE!" >&2
    exit 1
  fi
  OPTS+=(-l "$((P - 1))")
fi

if [[ -z "$OUTFILE" ]]; then
  OUTFILE="${BASEFILE%.pdf}.djvu"
fi
if [[ -f "$OUTFILE" ]]; then
  echo "$OUTFILE exists. Will not overwrite." >&2
  exit 1
fi
# We will need an absolute path later, because the result is copied from
# inside the temporary directory.
OUTFILE=$(readlink -f -- "$OUTFILE")
if [[ -z "$OUTFILE" ]]; then
  echo "Can't resolve the path of the output file!" >&2
  exit 1
fi

# Private, unpredictable work directory instead of a guessable /tmp name.
TEMPDIR=$(mktemp -d "/tmp/$PROG-XXXXXX") || exit 1
IMGBASE="$TEMPDIR/$BASEFILE" # base name for extracted images

# Remove any temporary files we created, unless asked not to do so.
# Running this from an EXIT trap also covers every error exit below.
function cleanup {
  if [[ -z "$DEBUG" && -z "$KEEPFILES" ]]; then
    rm -Rf -- "$TEMPDIR"
  fi
}
trap cleanup EXIT

echo -n "Extracting pages from PDF..."
# shellcheck disable=SC2086 # DEBUG and PDFIMGS are split on purpose
$DEBUG $PDFIMGS "${OPTS[@]}" "$PDFFILE" "$IMGBASE"

pushd "$TEMPDIR" >/dev/null || exit 1

# Collect the extracted pages with globs rather than by parsing ls, so that
# names containing spaces stay intact.
shopt -s nullglob
PAGES=("$BASEFILE"*."$IMGEXTN")
if [[ -z "$DEBUG" && ${#PAGES[@]} -eq 0 ]]; then
  # It may be that pdfimages gave us ppm files.
  PAGES=("$BASEFILE"*.ppm)
  if [[ ${#PAGES[@]} -eq 0 ]]; then
    echo
    echo "No pages appear to have been extracted from the pdf. Aborting." >&2
    exit 1
  fi
  IMGEXTN="ppm"
fi
shopt -u nullglob
echo "done."

if [[ -n "$DEBUG" ]]; then
  # Nothing was extracted in debug mode; use stand-in page names.
  PAGES=(page1.pbm page2.pbm page3.pbm page4.pbm)
fi

echo
if [[ -z "$INPUTDPI" ]]; then
  if [[ -n "$DEBUG" ]]; then
    INPUTDPI=300
  else
    PAGE=${PAGES[0]}
    # Ask identify for the height directly. Matching "<w>x<h>" in its
    # default output picks up digits from the file name, which is printed
    # first (a PDF called scan-2x4.pdf would yield a height of 4).
    INPUTY=$(identify -format '%h\n' "$PAGE" | head -n1)
    if [[ ! "$INPUTY" =~ ^[0-9]+$ ]]; then
      echo "Can't find file input size for $BASEFILE.$IMGEXTN!" >&2
      exit 1
    fi
    if ((INPUTY <= 3000)); then
      INPUTDPI=300
    else
      INPUTDPI=600
    fi
    echo "Guessing DPI of $INPUTDPI dpi for page of $INPUTY pixels."
    echo "Does that seem correct?"
    select speed in Yes No; do
      if [[ "$speed" == "No" ]]; then
        # Show the page so the user can judge the resolution.
        display "$PAGE" &
        echo -n "Enter DPI: "
        read -r INPUTDPI _
        echo "DPI set to $INPUTDPI dpi."
      else
        echo "Good!"
      fi
      break
    done
  fi
fi

if [[ -n "$VERBOSE" ]]; then
  echo "PDF Pages: "
  echo "${PAGES[*]}"
fi

echo
echo -n "Converting to TIFF..."
TIFFS=()
for i in "${PAGES[@]}"; do
  TIFFS+=("$i.tiff")
  # Delete each source image as soon as its TIFF exists: if the file is
  # really large, we could actually run out of space!
  # shellcheck disable=SC2086 # DEBUG and INVERT are split on purpose
  if $DEBUG convert $INVERT "$i" "$i.tiff"; then
    $DEBUG rm -- "$i"
  else
    # Every later step needs all of the TIFFs, so do not carry on with a
    # page missing.
    echo
    echo "Could not convert $i to TIFF. Aborting." >&2
    exit 1
  fi
done
echo "done."

mkdir -p tiffs || exit 1

if [[ -n "$USESCANTAILOR" ]]; then
  echo -n "Running ScanTailor..."
  if [[ -n "$DEBUG" ]]; then
    echo
  fi
  # NOTE(review): the project file path is resolved from inside $TEMPDIR,
  # so a relative $PDFFILE places it in the temporary directory -- confirm
  # that this is intended.
  # shellcheck disable=SC2086 # DEBUG and THRESHOLD are split on purpose
  $DEBUG scantailor-cli --deskew=auto --margins=5 --despeckle=normal \
    --content-detection=normal -o="$PDFFILE.ScanTailor" $THRESHOLD \
    --dpi="$INPUTDPI" --output-dpi=300 "${TIFFS[@]}" tiffs
  echo "done."
  if [[ -n "$RUNSCANTAILOR" ]]; then
    scantailor "$PDFFILE.ScanTailor"
  fi
else
  # No ScanTailor pass: hand the TIFFs over unchanged.
  for FIL in "${TIFFS[@]}"; do
    echo "$FIL"
    cp -- "$FIL" "tiffs/$FIL.tif"
  done
  ls tiffs/
fi

pushd tiffs/ >/dev/null || exit 1
# shellcheck disable=SC2086 # option strings are split on purpose
$DEBUG tiff2djvu -o "tiff2djvu-$$.djvu" $KEEPFILES $TWOPAGE $OCR *.tif
$DEBUG cp "tiff2djvu-$$.djvu" "$OUTFILE" || exit 1
popd >/dev/null || exit 1

# The temporary directory is removed by the EXIT trap installed above.