|
Book ingesting script update
- ingpiubook.sh
#!/bin/bash
# ingpiubook.sh - run ingbookepages.sh once for every book directory.
#
# Parameters
#   $1  path prefix of the book directories; every directory matching "$1"*
#       (not descending into them) is processed
#       e.g. IMPORT_BASE_DIR="/srv/storage/scansioni/cognetti_collezioni/direttori/"
#   $2  Collection PID
#       e.g. IMPORT_COLLECTION="openbess:cognetti-C001"
#   $3  fedoraAdmin password, or the word "test"
#   $4  number of pages to ingest, or 0 for all
FEDORA_HOME="/usr/local/fedora"
export FEDORA_HOME
PATH=$PATH:$FEDORA_HOME/server/bin:$FEDORA_HOME/client/bin:$JAVA_HOME/bin:/bin:/usr/bin:/sbin:/usr/sbin
export PATH

IMPORT_BASE_DIR=$1
IMPORT_COLLECTION=$2

# Collect the directories first (NUL-delimited, so names containing spaces
# survive) instead of looping over the find output directly: the per-book
# script must not inherit the directory list as its standard input.
book_dirs=()
while IFS= read -r -d '' ndir; do
  book_dirs+=("$ndir")
done < <(find "$IMPORT_BASE_DIR"* -maxdepth 0 -type d -print0)

for ndir in "${book_dirs[@]}"; do
  # Outside test mode, pause before each book.
  if [[ "$3" != test ]]; then
    sleep 60
  else
    echo "NO wait"
  fi
  echo "=====> $ndir"
  ./ingbookepages.sh "$ndir" "$IMPORT_COLLECTION" "$3" "$4"
done
- ingbookepages.sh
#!/bin/bash
# Ingest one scanned book (book object plus its pages).
#
# Parameters
#   $1  book directory
#       e.g. IMPORT_BASE_DIR="/srv/storage/scansioni/archivio/fgramsci_TO024-00001"
#   $2  Collection
#       e.g. COLLEZIONE="openbess:cognetti-C001"
#   $3  fedoraAdmin password or test
#   $4  number of pages to ingest or 0 for all
export FEDORA_HOME="/usr/local/fedora"
export PATH="${PATH}:${FEDORA_HOME}/server/bin:${FEDORA_HOME}/client/bin:${JAVA_HOME}/bin:/bin:/usr/bin:/sbin:/usr/sbin"

IMPORT_BASE_DIR="$1"
COLLEZIONE="$2"
# Last path component of the book directory.
IMPORT_VOL_DIR="${IMPORT_BASE_DIR##*/}"

#### pre-ingesting check ####
#############################
# The book PID is the directory name with its first "_" turned into ":".
bookPID="${IMPORT_VOL_DIR/_/:}"
imagedir="${IMPORT_BASE_DIR}/"
printf '%s\n' \
  "=============================================================================" \
  "Book PID: ${bookPID}" \
  "dir: ${imagedir}" \
  "-----------------------------------------------------------------------------"
#check files TIFF
# check_tiff_sequence PREFIX
#   Walks the regular files matching PREFIX*.tif in glob order and requires
#   the n-th one to end in its zero-padded position (0001.tif, 0002.tif, ...).
#   Sets the global ntiff to the number of files seen.
#   Returns 1, after printing the offending file, on the first mismatch.
check_tiff_sequence() {
  local prefix=$1 nfile sn
  ntiff=0
  for nfile in "$prefix"*.tif; do
    # Skips the unexpanded pattern (no match) and anything that is not a
    # plain regular file.
    [[ -f "$nfile" && ! -L "$nfile" ]] || continue
    ntiff=$((ntiff + 1))
    printf -v sn '%04d' "$ntiff"
    if [[ "$nfile" != *"$sn".tif ]]; then
      echo "ERROR file $nfile non corrisponde a $sn"
      return 1
    fi
  done
  return 0
}
# SAVEIFS is still recorded because later sections restore IFS from it.
SAVEIFS=$IFS
# Abort with a non-zero status so callers can detect the failure.
check_tiff_sequence "$imagedir" || exit 1
#check only one OCR directory
# Exactly one directory (searched recursively) must exist below the image
# directory prefix; it becomes ocrdir. find output is read NUL-delimited so a
# directory name containing spaces is counted once, and find runs only once.
n=0
ocrdir=""
while IFS= read -r -d '' ndir; do
  n=$((n + 1))
  ocrdir=$ndir
done < <(find "$imagedir"* -type d -print0)
if (( n != 1 )); then
  echo "ERROR dir TXT non unica"
  exit 1
fi
#check files TXT
# check_txt_sequence DIR
#   Walks the regular files matching DIR/*.txt in glob order and requires the
#   n-th one to end in its zero-padded position (0001.txt, 0002.txt, ...).
#   Sets the global ntxt to the number of files seen.
#   Returns 1, after printing the offending file, on the first mismatch.
check_txt_sequence() {
  local dir=$1 nfile st
  ntxt=0
  for nfile in "$dir"/*.txt; do
    # Skips the unexpanded pattern (no match) and anything that is not a
    # plain regular file.
    [[ -f "$nfile" && ! -L "$nfile" ]] || continue
    ntxt=$((ntxt + 1))
    printf -v st '%04d' "$ntxt"
    if [[ "$nfile" != *"$st".txt ]]; then
      echo "ERROR file $nfile non corrisponde a $st"
      return 1
    fi
  done
  return 0
}
# SAVEIFS is still recorded because later sections restore IFS from it.
SAVEIFS=$IFS
check_txt_sequence "$ocrdir" || exit 1
#check same number of files TIFF and TXT
if (( ntiff != ntxt )); then
  echo "ERROR files $ntiff TIFF non corrisponde a $ntxt TXT"
  exit 1
fi
#check PDF
# Exactly one regular *.pdf file must sit next to the page images. It is
# copied to a fixed work path, and filepdf is repointed at that copy.
n=0
# SAVEIFS is still recorded because later sections restore IFS from it.
SAVEIFS=$IFS
filepdf=""
for nfile in "$imagedir"*.pdf; do
  # Skips the unexpanded pattern (no match) and non-regular files.
  [[ -f "$nfile" && ! -L "$nfile" ]] || continue
  n=$((n + 1))
  filepdf=$nfile
done
if (( n != 1 )); then
  echo "ERROR file PDF non unico"
  exit 1
fi
# Quoted so a PDF name containing spaces is copied as one argument; a failed
# copy stops the run instead of ingesting a stale or missing work file.
if ! cp -- "$filepdf" "/srv/temp/pdf.pdf"; then
  echo "ERROR copy file $filepdf"
  exit 1
fi
filepdf="/srv/temp/pdf.pdf"
#check DC and index
# Exactly one regular *.txt file must sit next to the page images; it holds
# the Dublin Core lines and the index lines.
n=0
# SAVEIFS is still recorded because later sections restore IFS from it.
SAVEIFS=$IFS
filedcindice=""
for nfile in "$imagedir"*.txt; do
  # Skips the unexpanded pattern (no match) and non-regular files.
  [[ -f "$nfile" && ! -L "$nfile" ]] || continue
  n=$((n + 1))
  filedcindice=$nfile
done
if (( n != 1 )); then
  echo "ERROR file DC e Indice non unico"
  exit 1
fi
#get DC and index (work on a writable copy)
cp -- "$filedcindice" "/srv/temp/fdci.dci"
chmod +w "/srv/temp/fdci.dci"
#from dos to unix
fromdos "/srv/temp/fdci.dci"
filedcindice="/srv/temp/fdci.dci"
# The copy must be a non-empty regular file. The problem is always reported;
# in test mode the run continues, otherwise it stops with a failure status.
if [[ ! -f "$filedcindice" || ! -s "$filedcindice" ]]; then
  echo "ERROR file $filedcindice"
  if [[ "$3" != test ]]; then
    exit 1
  fi
fi
#check utf-8 or us-ascii
filetype=$(file -bi "$filedcindice")
if [[ "$filetype" != *utf-8 && "$filetype" != *us-ascii ]]; then
  echo "ERROR file $filedcindice $filetype"
  if [[ "$3" != test ]]; then
    exit 1
  fi
fi
#cut 3 special chars from begin (the bytes EF BB BF at the start of line 1)
awk '{if(NR==1)sub(/^\xef\xbb\xbf/,"");print}' "/srv/temp/fdci.dci" > "/srv/temp/fdci.idx"
rm "/srv/temp/fdci.dci"
filedcindice="/srv/temp/fdci.idx"
# Template placeholders, index-aligned with the values in DCV. They are quoted
# so the bracket expressions can never be glob-expanded against file names in
# the current directory.
DCN=( "[DCTITLE]" "[DCCREATOR]" "[DCSUBJECT]" "[DCDESCRIPTION]" "[DCPUBLISHER]" "[DCDATE]" "[DCTYPE]" "[DCFORMAT]" "[DCLANGUAGE]" )
DCV[0]=""
linkind=""
ndc=0
nind=0

# load_dc_metadata FILE
#   Reads the "NAME:value" lines of FILE into DCV (slot order as in DCN) and
#   counts them in ndc. Empty lines and lines starting with a digit (index
#   entries) are ignored here. DATE must look like YYYY, YYYY-MM or
#   YYYY-MM-DD.
#   Returns 1, after printing the reason, on an unknown name or a bad date.
load_dc_metadata() {
  local dcfile=$1 line dcvar dcval
  # Plain `read` (no -r, default IFS) is kept on purpose: it trims the line
  # and drops backslashes, and the values are later inserted into sed
  # replacement text where only "/" and "&" get escaped.
  # shellcheck disable=SC2162
  while read line; do
    [[ -n "$line" ]] || continue
    # Explicit digit list: independent of the locale's collation order.
    [[ "$line" != [0123456789]* ]] || continue
    dcvar="${line%%:*}"
    dcval="${line#*:}"
    ndc=$((ndc + 1))
    case "$dcvar" in
      TITLE) DCV[0]="$dcval" ;;
      CREATOR) DCV[1]="$dcval" ;;
      SUBJECT) DCV[2]="$dcval" ;;
      DESCRIPTION) DCV[3]="$dcval" ;;
      PUBLISHER) DCV[4]="$dcval" ;;
      DATE)
        case "$dcval" in
          [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] | \
          [0-9][0-9][0-9][0-9]-[0-9][0-9] | \
          [0-9][0-9][0-9][0-9])
            DCV[5]="$dcval"
            ;;
          *)
            echo "ERRORE Formato data: $dcval"
            return 1
            ;;
        esac
        ;;
      TYPE) DCV[6]="$dcval" ;;
      FORMAT) DCV[7]="$dcval" ;;
      LANGUAGE) DCV[8]="$dcval" ;;
      *)
        echo "ERRORE metadati DC: $dcvar val: $dcval"
        return 1
        ;;
    esac
  done < "$dcfile"
  return 0
}

if [[ -r "$filedcindice" ]]; then
  load_dc_metadata "$filedcindice" || exit 1
else
  # Nothing can be parsed; DCV stays empty, so the mandatory DC TITLE check
  # that follows stops the run.
  echo "ERROR file $filedcindice"
fi
#check DC TITLE
# A title is mandatory: it opens the index link string, paired with the book
# PID. Without it the run stops with a failure status.
if [[ -z "${DCV[0]}" ]]; then
  echo "ERRORE Manca DC TITLE"
  exit 1
fi
linkind="$bookPID|${DCV[0]}"
#display DC (placeholder = value, one line per Dublin Core slot)
for i in {0..8}; do
  echo "${DCN[$i]} = ${DCV[$i]}"
done
#check index
# append_index_links FILE
#   Second pass over the metadata file: every non-empty line that starts with
#   a digit must be "NNNN:title" with a four-digit page number. Each entry is
#   echoed, counted in nind and appended to linkind as
#   "||<bookPID>-NNNN|title".
#   Returns 1, after printing the entry, when the page number is not four
#   digits.
append_index_links() {
  local idxfile=$1 line dcvar dcval
  # Plain `read` (no -r, default IFS) is kept so lines are interpreted exactly
  # as in the Dublin Core pass over the same file.
  # shellcheck disable=SC2162
  while read line; do
    [[ -n "$line" ]] || continue
    # Explicit digit list: independent of the locale's collation order.
    [[ "$line" == [0123456789]* ]] || continue
    dcvar="${line%%:*}"
    dcval="${line#*:}"
    case "$dcvar" in
      [0-9][0-9][0-9][0-9])
        echo "$dcval"" pag.""$dcvar"
        nind=$((nind + 1))
        linkind="$linkind""||$bookPID-$dcvar|$dcval"
        ;;
      *)
        echo "ERRORE indice: $dcval pag. $dcvar"
        return 1
        ;;
    esac
  done < "$idxfile"
  return 0
}

if [[ -r "$filedcindice" ]]; then
  append_index_links "$filedcindice" || exit 1
fi
rm "$filedcindice"
#### book ingesting ####
########################
# Write the accumulated index link string to the INDEX datastream file,
# discarding any file left there by an earlier run.
fileindex="/srv/temp/idx.idx"
[[ ! -e "$fileindex" ]] || rm "$fileindex"
echo "$linkind" > "$fileindex"
#create book thumbnail image
# The thumbnail is rendered from the scan whose name ends in 0001.tif.
filetiff=$(find "$imagedir"*0001.tif -type f)
# Quoted: an empty result (no such scan) or several matches now fail this
# test instead of slipping through.
if [[ ! -e "$filetiff" ]]; then
  echo "ERRORE file TIFF 0001 per TN $filetiff"
  exit 1
fi
#uncompress
tiffcp -c none "$filetiff" "/srv/temp/tiff.tmp"
#NOT gray scale
cp "/srv/temp/tiff.tmp" "/srv/temp/tiff.tif"
rm "/srv/temp/tiff.tmp"
#create thumbnail
filetn="/srv/temp/tnbook.jpg"
# Remove any thumbnail left by an earlier run so the check below reflects
# this conversion only.
rm -f "$filetn"
risconvert=$(convert "/srv/temp/tiff.tif"[0] -thumbnail 200x225 -fuzz 1% +repage -gravity center -format jpg -quality 100 "$filetn")
if [[ ! -f "$filetn" || ! -s "$filetn" ]]; then
  echo "ERROR file $filetn : $risconvert"
  exit 1
fi
#convert to RGB (IE8 compatibility)
risconvert=$(convert "$filetn" -colorspace RGB "$filetn")
if [[ ! -f "$filetn" || ! -s "$filetn" ]]; then
  echo "ERROR file $filetn : $risconvert"
  exit 1
fi
#prepare template
# sed_escape_replacement TEXT
#   Prints TEXT with every "\", "/" and "&" prefixed by a backslash, so it can
#   be used verbatim as the replacement part of a sed "s/.../.../" command.
sed_escape_replacement() {
  printf '%s' "$1" | sed 's|[\\/&]|\\&|g'
}

# Start from a fresh copy of the master template, then fill each [PLACEHOLDER].
book_template=/home/giancarlo/clineFC/import/templateBook.xml
cp /home/giancarlo/clineFC/import/templateBookMaster.xml "$book_template"
pid=$(sed_escape_replacement "$bookPID")
pidlabel=$(sed_escape_replacement "${DCV[0]}")
collezione=$(sed_escape_replacement "$COLLEZIONE")
sed -i "s/\[PID\]/$pid/g;s/\[PIDLABEL\]/$pidlabel/g;s/\[COLLEZIONE\]/$collezione/g" "$book_template"
for i in {0..8}; do
  # The placeholder is used as a sed pattern, so its brackets must be literal.
  var=${DCN[$i]//"["/"\["}
  var=${var//"]"/"\]"}
  valo=$(sed_escape_replacement "${DCV[$i]}")
  sed -i "s/$var/$valo/g" "$book_template"
done
ftn=$(sed_escape_replacement "$filetn")
findex=$(sed_escape_replacement "$fileindex")
fpdf=$(sed_escape_replacement "$filepdf")
sed -i "s/\[FILETN\]/$ftn/g;s/\[FILEINDEX\]/$findex/g;s/\[FILEPDF\]/$fpdf/g" "$book_template"
# Ingest the book object, then pause before the pages follow. In test mode
# nothing is sent and there is no pause.
if [[ "$3" != test ]]; then
  #ingest book
  # NOTE(review): the password travels on the command line of
  # fedora-ingest.sh and is therefore visible in the process list.
  # It is quoted so a password containing spaces or glob characters is
  # passed as a single argument.
  risultato=$(/usr/local/fedora/client/bin/fedora-ingest.sh f /home/giancarlo/clineFC/import/templateBook.xml info:fedora/fedora-system:FOXML-1.1 fc1.to.cnr.it:8080 fedoraAdmin "$3" http)
  if [[ "$risultato" != Ingested* ]]; then
    echo "ERROR INGESTING $risultato"
    exit 1
  fi
  echo "$risultato"
  sleep 120
else
  #or test
  echo "-> INGESTED(test)"
  echo "NO wait"
fi
#### pages ingesting ####
#########################
# For every page: convert the TIFF scan to gray scale, build a thumbnail and
# a JPEG2000 derivative, clean the OCR text, fill the page template and
# ingest it.
#   $3  fedoraAdmin password, or "test" (report problems, ingest nothing)
#   $4  number of pages to ingest, or 0 for all
ingest_mode=$3

# page_problem MESSAGE
#   Prints MESSAGE. Outside test mode the run stops with a failure status; in
#   test mode processing continues so that every problem gets listed.
page_problem() {
  echo "$1"
  if [[ "$ingest_mode" != test ]]; then
    exit 1
  fi
}

# usable_file PATH - true when PATH is an existing, non-empty regular file.
# PATH is quoted, so an empty PATH is reported as unusable.
usable_file() {
  [[ -f "$1" && -s "$1" ]]
}

# 10# forces base ten, so a count written with leading zeros is not read as
# octal. Anything that is not a positive number means "all pages".
if [[ "$4" =~ ^[0-9]+$ ]] && (( 10#$4 > 0 )); then
  finoa=$((10#$4))
else
  finoa=${ntiff:-0}
fi

page_template=/home/giancarlo/clineFC/import/templatePage.xml
for (( npage = 1; npage <= finoa; npage++ )); do
  #for every page
  printf -v snpage '%04d' "$npage"
  pagePID="$bookPID-$snpage"
  # Escape "\", "/" and "&" so the label is literal in sed replacement text.
  valo=${DCV[0]//"\\"/"\\\\"}
  valo=${valo//"/"/"\/"}
  valo=${valo//"&"/"\&"}
  pagePIDlabel="$valo - page $snpage"

  filetiff=$(find "$imagedir"*"$snpage".tif -type f)
  usable_file "$filetiff" || page_problem "$snpage -> ERROR file $filetiff"

  # Drop the work files of the previous page first: a failed conversion must
  # not be masked by (and ingested as) the previous page's output.
  rm -f "/srv/temp/tiff.tmp" "/srv/temp/tiff.tif"
  #uncompress
  tiffcp -c none "$filetiff" "/srv/temp/tiff.tmp"
  #to gray scale
  convert "/srv/temp/tiff.tmp" -colorspace Gray "/srv/temp/tiff.tif"
  rm "/srv/temp/tiff.tmp"
  filetiff="/srv/temp/tiff.tif"

  #page thumbnail
  filetn="/srv/temp/page-tn.jpg"
  rm -f "$filetn"
  risconvert=$(convert "$filetiff"[0] -thumbnail 200x225 -fuzz 1% +repage -gravity center -format jpg -quality 100 "$filetn")
  usable_file "$filetn" || page_problem "$snpage -> ERROR file $filetn : $risconvert"

  #page jpeg2k
  filejp2="/srv/temp/jp2.jp2"
  rm -f "$filejp2"
  riscompres=$(kdu_compress -i "$filetiff" -o "$filejp2" -rate 0.5 Clayers=1 Clevels=7 "Cprecincts={256,256},{256,256},{256,256},{128,128},{128,128},{64,64},{64,64},{32,32},{16,16}" "Corder=RPCL" "ORGgen_plt=yes" "ORGtparts=R" "Cblk={32,32}" Cuse_sop=yes)
  usable_file "$filejp2" || page_problem "$snpage -> ERROR file $filejp2 : $riscompres"

  #OCR file
  fileocr=$(find "$ocrdir"/*"$snpage".txt -type f)
  fileocrtmp="/srv/temp/ocr.tmp"
  rm -f "$fileocrtmp"
  #cut FF  oct 014 char
  tr -d '\014' < "$fileocr" > "$fileocrtmp"
  #from dos to unix
  fromdos "$fileocrtmp"
  [[ -f "$fileocrtmp" ]] || page_problem "$snpage -> ERROR file $fileocrtmp"
  filel=$(stat -c %s "$fileocrtmp")
  filenull=4
  #check zero length: an OCR text of at most $filenull bytes counts as blank
  if [[ ! -s "$fileocrtmp" ]] || (( ${filel:-0} <= filenull )); then
    # blank.txt is looked up in the current working directory.
    cp blank.txt "$fileocrtmp"
    echo "blank file OCR"
  fi
  #check utf-8 or us-ascii
  filetype=$(file -bi "$fileocrtmp")
  if [[ "$filetype" != *utf-8 && "$filetype" != *us-ascii ]]; then
    page_problem "$snpage -> ERROR file $fileocrtmp"
  fi
  # Strip the bytes EF BB BF from the start of the first line.
  awk '{if(NR==1)sub(/^\xef\xbb\xbf/,"");print}' "/srv/temp/ocr.tmp" > "/srv/temp/ocr.ocr"
  rm "/srv/temp/ocr.tmp"
  fileocrtmp="/srv/temp/ocr.ocr"

  #prepare template
  cp /home/giancarlo/clineFC/import/templatePageMaster.xml "$page_template"
  sed -i "s/\[PID\]/$pagePID/g;s/\[PIDLABEL\]/$pagePIDlabel/g" "$page_template"
  sed -i "s/\[DCTITLE\]/$pagePIDlabel/g;s/\[PIDBOOK\]/$bookPID/g" "$page_template"
  #without DCTITLE (slot 0 was filled with the page label above)
  for i in {1..8}; do
    var=${DCN[$i]//"["/"\["}
    var=${var//"]"/"\]"}
    valo=${DCV[$i]//"\\"/"\\\\"}
    valo=${valo//"/"/"\/"}
    valo=${valo//"&"/"\&"}
    sed -i "s/$var/$valo/g" "$page_template"
  done
  ftn=${filetn//"/"/"\/"}
  fjp2=${filejp2//"/"/"\/"}
  focr=${fileocrtmp//"/"/"\/"}
  ftiff=${filetiff//"/"/"\/"}
  sed -i "s/\[FILETN\]/$ftn/g;s/\[FILEJP2\]/$fjp2/g;s/\[FILEOCR\]/$focr/g" "$page_template"
  sed -i "s/\[FILETIFF\]/$ftiff/g" "$page_template"

  if [[ "$ingest_mode" != test ]]; then
    #ingest page
    # NOTE(review): the password travels on the command line of
    # fedora-ingest.sh and is therefore visible in the process list.
    risultato=$(/usr/local/fedora/client/bin/fedora-ingest.sh f /home/giancarlo/clineFC/import/templatePage.xml info:fedora/fedora-system:FOXML-1.1 fc1.to.cnr.it:8080 fedoraAdmin "$3" http)
    if [[ "$risultato" != Ingested* ]]; then
      echo "$snpage -> ERROR INGESTING $risultato"
      exit 1
    fi
    echo "$snpage -> $risultato"
  else
    #or test
    echo "$snpage -> INGESTED(test)"
  fi
done
|
|