Book ingesting script update

  • Multiple book ingesting script (same collection)
ingpiubook.sh
#!/bin/bash
#
# Ingest several books of the same collection, one after the other.
#
# Usage: ingpiubook.sh BASE_DIR COLLECTION_PID PASSWORD|test PAGES
#   $1  path prefix of the book directories; every directory matching
#       "$1"* is handed to ./ingbookepages.sh
#       e.g. "/srv/storage/scansioni/cognetti_collezioni/direttori/"
#   $2  collection PID, e.g. "openbess:cognetti-C001"
#   $3  fedoraAdmin password, or the word "test"
#   $4  number of pages to ingest, or 0 for all

FEDORA_HOME="/usr/local/fedora"
export FEDORA_HOME

PATH=$PATH:$FEDORA_HOME/server/bin:$FEDORA_HOME/client/bin:$JAVA_HOME/bin:/bin:/usr/bin:/sbin:/usr/sbin
export PATH

IMPORT_BASE_DIR=$1
IMPORT_COLLECTION=$2

# A glob instead of $(find ...) keeps directory names containing
# whitespace in one piece.
for ndir in "$IMPORT_BASE_DIR"*; do
   # Real directories only (symlinks are skipped, like find -type d);
   # this also discards the literal pattern when nothing matches.
   [[ -d "$ndir" && ! -L "$ndir" ]] || continue

   # Real runs pause 60 seconds before each book; test runs do not.
   if [[ "$3" != test ]]; then
      sleep 60
   else
      echo "NO wait"
   fi
   echo "=====> $ndir"
   # "${@:3:2}" forwards $3 and $4 exactly as received, without re-splitting.
   ./ingbookepages.sh "$ndir" "$IMPORT_COLLECTION" "${@:3:2}"
done
  • Single book ingesting script
ingbookepages.sh
#!/bin/bash
#
# Ingest a single book and its pages.
#
# Usage: ingbookepages.sh BOOK_DIR COLLECTION_PID PASSWORD|test PAGES
#   $1  book directory
#       e.g. "/srv/storage/scansioni/archivio/fgramsci_TO024-00001"
#   $2  collection PID, e.g. "openbess:cognetti-C001"
#   $3  fedoraAdmin password, or the word "test"
#   $4  number of pages to ingest, or 0 for all

FEDORA_HOME="/usr/local/fedora"
export FEDORA_HOME

PATH=$PATH:$FEDORA_HOME/server/bin:$FEDORA_HOME/client/bin:$JAVA_HOME/bin:/bin:/usr/bin:/sbin:/usr/sbin
export PATH

# Print the book PID for a book directory path: the last path component
# with its first "_" replaced by ":". One trailing "/" is ignored.
book_pid_from_dir() {
   local dir=${1%/}
   dir=${dir##*/}
   printf '%s\n' "${dir/_/:}"
}

# A trailing slash is dropped so the directory name (and therefore the
# PID) is not empty when the path is given as ".../book/".
IMPORT_BASE_DIR=${1%/}
COLLEZIONE=$2
IMPORT_VOL_DIR=${IMPORT_BASE_DIR##*/}

#### pre-ingesting check ####
#############################

bookPID=$(book_pid_from_dir "$IMPORT_BASE_DIR")
imagedir="$IMPORT_BASE_DIR/"
echo "============================================================================="
echo "Book PID: $bookPID"
echo "dir: $imagedir"
echo "-----------------------------------------------------------------------------"
 
#check files TIFF
# The n-th *.tif file of the book directory (in glob order) must end in
# the 4-digit number n followed by ".tif". ntiff ends up holding the
# number of TIFF files found.

ntiff=0
for nfile in "$imagedir"*.tif; do
   # Regular files only; also skips the literal pattern when no file matches.
   [[ -f "$nfile" && ! -L "$nfile" ]] || continue
   ntiff=$((ntiff + 1))
   printf -v sn '%04d' "$ntiff"
   if [[ "$nfile" != *"$sn".tif ]]; then
      echo "ERROR file $nfile non corrisponde a $sn"
      # Non-zero status so a caller can tell the check failed.
      exit 1
   fi
done
 
#check only one OCR directory
# Exactly one directory may exist below the book directory (searched
# recursively); its path is stored in ocrdir.

# NUL-delimited find output keeps directory names containing whitespace
# in one piece, so they are counted once.
ocrdirs=()
while IFS= read -r -d '' ndir; do
   ocrdirs+=("$ndir")
done < <(find "$imagedir"* -type d -print0)
n=${#ocrdirs[@]}

if (( n != 1 )); then
   echo "ERROR dir TXT non unica"
   # Non-zero status so a caller can tell the check failed.
   exit 1
fi

ocrdir=${ocrdirs[0]}
 
#check files TXT
# The n-th *.txt file of the OCR directory (in glob order) must end in
# the 4-digit number n followed by ".txt". ntxt ends up holding the
# number of TXT files found.

ntxt=0
for nfile in "$ocrdir"/*.txt; do
   # Regular files only; also skips the literal pattern when no file matches.
   [[ -f "$nfile" && ! -L "$nfile" ]] || continue
   ntxt=$((ntxt + 1))
   printf -v st '%04d' "$ntxt"
   if [[ "$nfile" != *"$st".txt ]]; then
      echo "ERROR file $nfile non corrisponde a $st"
      exit 1
   fi
done

#check same number of files TIFF and TXT

if (( ntiff != ntxt )); then
   echo "ERROR files $ntiff TIFF non corrisponde a $ntxt TXT"
   exit 1
fi
 
#check PDF
# Exactly one *.pdf regular file must sit directly in the book directory.
# It is copied to /srv/temp/pdf.pdf and filepdf is pointed at the copy.

pdfs=()
for nfile in "$imagedir"*.pdf; do
   # Regular files only; also skips the literal pattern when no file matches.
   [[ -f "$nfile" && ! -L "$nfile" ]] || continue
   pdfs+=("$nfile")
done
n=${#pdfs[@]}
if (( n != 1 )); then
   echo "ERROR file PDF non unico"
   exit 1
fi

# Quoted, so a PDF path containing spaces is copied correctly; a failed
# copy stops the run instead of leaving an older pdf.pdf in place.
if ! cp -- "${pdfs[0]}" "/srv/temp/pdf.pdf"; then
   echo "ERROR file ${pdfs[0]}"
   exit 1
fi
filepdf="/srv/temp/pdf.pdf"
 
#check DC and index
# Exactly one *.txt regular file must sit directly in the book directory;
# it carries the Dublin Core metadata and the index. A cleaned copy is
# left in /srv/temp/fdci.idx and filedcindice is pointed at it.

# IFS is not modified here, but SAVEIFS is still recorded because later
# steps of this script restore IFS from it.
SAVEIFS=$IFS

dcfiles=()
for nfile in "$imagedir"*.txt; do
   # Regular files only; also skips the literal pattern when no file matches.
   [[ -f "$nfile" && ! -L "$nfile" ]] || continue
   dcfiles+=("$nfile")
done
n=${#dcfiles[@]}
if (( n != 1 )); then
   echo "ERROR file DC e Indice non unico"
   exit 1
fi

#get DC and index

# Remove any leftover first so a failed copy cannot go unnoticed.
rm -f "/srv/temp/fdci.dci"
cp -- "${dcfiles[0]}" "/srv/temp/fdci.dci"
chmod +w "/srv/temp/fdci.dci"
#from dos to unix
fromdos "/srv/temp/fdci.dci"
filedcindice="/srv/temp/fdci.dci"

# The copy must be a non-empty regular file. The problem is always
# reported; test runs ($3 = test) then carry on, real runs stop.
if [[ ! -f "$filedcindice" || ! -s "$filedcindice" ]]; then
   echo "ERROR file $filedcindice"
   if [[ "$3" != test ]]; then
      exit 1
   fi
fi

#check utf-8 or us-ascii
filetype=$(file -bi "$filedcindice")
if [[ "$filetype" != *utf-8 && "$filetype" != *us-ascii ]]; then
   echo "ERROR file $filedcindice $filetype"
   if [[ "$3" != test ]]; then
      exit 1
   fi
fi

#cut 3 special chars from begin
awk '{if(NR==1)sub(/^\xef\xbb\xbf/,"");print}' "/srv/temp/fdci.dci" > "/srv/temp/fdci.idx"
rm "/srv/temp/fdci.dci"
filedcindice="/srv/temp/fdci.idx"
 
# Parse the Dublin Core lines ("KEY:value") of the metadata file.
#   DCN  template placeholders, one per DC element
#   DCV  values read from the file, same positions as DCN
# The placeholders are quoted: unquoted, a word like [DCTITLE] is a glob
# pattern and would be replaced by a matching one-letter file name in
# the current directory.
DCN=( "[DCTITLE]" "[DCCREATOR]" "[DCSUBJECT]" "[DCDESCRIPTION]" "[DCPUBLISHER]" "[DCDATE]" "[DCTYPE]" "[DCFORMAT]" "[DCLANGUAGE]" )
DCV=()
linkind=""
nind=0

# read is used without -r on purpose: the values are later spliced into
# sed replacement strings, where a raw backslash would be misread.
# "|| [[ -n $line ]]" keeps a last line that has no trailing newline.
while read line || [[ -n "$line" ]]; do
   [[ -n "$line" ]] || continue
   # Lines starting with a digit are index entries, not DC metadata.
   [[ "${line:0:1}" == [0-9] ]] && continue

   dcvar="${line%%:*}"
   dcval="${line#*:}"
   case "$dcvar" in
      TITLE)       DCV[0]="$dcval" ;;
      CREATOR)     DCV[1]="$dcval" ;;
      SUBJECT)     DCV[2]="$dcval" ;;
      DESCRIPTION) DCV[3]="$dcval" ;;
      PUBLISHER)   DCV[4]="$dcval" ;;
      DATE)
         # Accepted forms: YYYY-MM-DD, YYYY-MM, YYYY
         case "$dcval" in
            [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] | \
            [0-9][0-9][0-9][0-9]-[0-9][0-9] | \
            [0-9][0-9][0-9][0-9])
               DCV[5]="$dcval"
            ;;
            *)
               echo "ERRORE Formato data: $dcval"
               exit 1
            ;;
         esac
      ;;
      TYPE)        DCV[6]="$dcval" ;;
      FORMAT)      DCV[7]="$dcval" ;;
      LANGUAGE)    DCV[8]="$dcval" ;;
      *)
         echo "ERRORE metadati DC: $dcvar val: $dcval"
         exit 1
      ;;
   esac
done < "$filedcindice"

#check DC TITLE
# The title is mandatory; it becomes the first entry of the index string.

if [[ -n "${DCV[0]}" ]]; then
   linkind="$bookPID|${DCV[0]}"
else
   echo "ERRORE Manca DC TITLE"
   exit 1
fi

#display DC

for i in {0..8}; do
   echo "${DCN[$i]} = ${DCV[$i]}"
done
 
#check index
# Second pass over the metadata file: every line starting with a digit
# must be "NNNN:label" (4-digit page number). Each entry is printed and
# appended to linkind as "||<bookPID>-NNNN|label".

# read is used without -r on purpose, so labels never carry a raw
# backslash. "|| [[ -n $line ]]" keeps a last line that has no trailing
# newline.
while read line || [[ -n "$line" ]]; do
   # Empty lines and lines not starting with a digit are not index entries.
   [[ "${line:0:1}" == [0-9] ]] || continue

   dcvar="${line%%:*}"
   dcval="${line#*:}"
   case "$dcvar" in
      [0-9][0-9][0-9][0-9])
         echo "$dcval pag.$dcvar"
         nind=$((nind + 1))
         linkind="$linkind||$bookPID-$dcvar|$dcval"
      ;;
      *)
         echo "ERRORE indice: $dcval  pag. $dcvar"
         exit 1
      ;;
   esac
done < "$filedcindice"
rm "$filedcindice"
 
#### book ingesting ####
########################

#create datastream INDEX file
# The index string built from the metadata file is written to a fresh file.

fileindex="/srv/temp/idx.idx"
rm -f "$fileindex"
echo "$linkind" > "$fileindex"

#create book thumbnail image
# The thumbnail is made from the page whose name ends in 0001.tif.

filetiff=""
for nfile in "$imagedir"*0001.tif; do
   # Regular files only; also skips the literal pattern when no file matches.
   if [[ -f "$nfile" && ! -L "$nfile" ]]; then
      filetiff=$nfile
      break
   fi
done
# An empty result is tested explicitly: an unquoted "[ ! -e $filetiff ]"
# is false when the variable is empty, so a missing page went unnoticed.
if [[ -z "$filetiff" ]]; then
   echo "ERRORE file TIFF 0001 per TN ${imagedir}*0001.tif"
   exit 1
fi

filetn="/srv/temp/tnbook.jpg"
# Drop leftovers of an earlier run so the checks below cannot pass on
# stale files when a conversion fails.
rm -f "/srv/temp/tiff.tmp" "/srv/temp/tiff.tif" "$filetn"

#uncompress
tiffcp -c none "$filetiff" "/srv/temp/tiff.tmp"

#NOT gray scale
mv -f "/srv/temp/tiff.tmp" "/srv/temp/tiff.tif"

#create thumbnail
risconvert=$(convert "/srv/temp/tiff.tif[0]" -thumbnail 200x225 -fuzz 1% +repage -gravity center -format jpg -quality 100  "$filetn")
if [[ ! -f "$filetn" || ! -s "$filetn" ]]; then
   echo "ERROR file $filetn : $risconvert"
   exit 1
fi
#convert to RGB (IE8 compatibility)
risconvert=$(convert "$filetn" -colorspace RGB "$filetn")
if [[ ! -f "$filetn" || ! -s "$filetn" ]]; then
   echo "ERROR file $filetn : $risconvert"
   exit 1
fi
 
#prepare template
# The master FOXML template is copied and its [PLACEHOLDER] markers are
# replaced by the book's values; the result is then ingested.

# Print $1 escaped for use as the replacement text of a sed "s/…/…/"
# command that writes into an XML file: XML special characters become
# entities, then backslash, "/" and "&" are backslash-escaped for sed.
# Replacement strings are single-quoted so "&" is always taken literally.
sed_xml_escape() {
   local s=$1
   s=${s//'&'/'&amp;'}
   s=${s//'<'/'&lt;'}
   s=${s//'>'/'&gt;'}
   s=${s//'"'/'&quot;'}
   s=${s//'\'/'\\'}
   s=${s//\//'\/'}
   s=${s//'&'/'\&'}
   printf '%s' "$s"
}

book_template=/home/giancarlo/clineFC/import/templateBook.xml
if ! cp /home/giancarlo/clineFC/import/templateBookMaster.xml "$book_template"; then
   echo "ERROR file $book_template"
   exit 1
fi

# All substitutions are collected and applied by a single sed run, in
# this order: PID/label/collection, the nine DC values, the file paths.
book_sed=(
   -e "s/\[PID\]/$(sed_xml_escape "$bookPID")/g"
   -e "s/\[PIDLABEL\]/$(sed_xml_escape "${DCV[0]}")/g"
   -e "s/\[COLLEZIONE\]/$(sed_xml_escape "$COLLEZIONE")/g"
)
for i in {0..8}; do
   # Escape the brackets so the placeholder is matched literally.
   var=${DCN[$i]//"["/"\["}
   var=${var//"]"/"\]"}
   book_sed+=( -e "s/$var/$(sed_xml_escape "${DCV[$i]}")/g" )
done
book_sed+=(
   -e "s/\[FILETN\]/$(sed_xml_escape "$filetn")/g"
   -e "s/\[FILEINDEX\]/$(sed_xml_escape "$fileindex")/g"
   -e "s/\[FILEPDF\]/$(sed_xml_escape "$filepdf")/g"
)
sed -i "${book_sed[@]}" "$book_template"

if [[ "$3" != test ]]; then

   #ingest book
   # "$3" (the fedoraAdmin password) is quoted so it reaches the client
   # as one argument.
   risultato=$(/usr/local/fedora/client/bin/fedora-ingest.sh f "$book_template" info:fedora/fedora-system:FOXML-1.1 fc1.to.cnr.it:8080 fedoraAdmin "$3" http)
   if [[ "$risultato" == Ingested* ]]; then
      echo "$risultato"
   else
      echo "ERROR INGESTING $risultato"
      exit 1
   fi

   # Real runs pause 120 seconds before the pages are ingested.
   sleep 120
else

   #or test
   echo "-> INGESTED(test)"
   echo "NO wait"
fi
 
#### pages ingesting ####
#########################
# For every page: build a gray-scale TIFF, a thumbnail, a JPEG2000 and a
# cleaned OCR text in /srv/temp, fill the page template and ingest it.

# $3 is copied because positional parameters are not visible inside
# functions.
ingest_mode=$3

# Report a page-level problem ($1 = message). Test runs keep going so
# every problem gets listed; real runs stop with a non-zero status.
page_error() {
   echo "$1"
   if [[ "$ingest_mode" != test ]]; then
      exit 1
   fi
}

# Print $1 escaped for use as the replacement text of a sed "s/…/…/"
# command that writes into an XML file: XML special characters become
# entities, then backslash, "/" and "&" are backslash-escaped for sed.
# Replacement strings are single-quoted so "&" is always taken literally.
sed_xml_escape() {
   local s=$1
   s=${s//'&'/'&amp;'}
   s=${s//'<'/'&lt;'}
   s=${s//'>'/'&gt;'}
   s=${s//'"'/'&quot;'}
   s=${s//'\'/'\\'}
   s=${s//\//'\/'}
   s=${s//'&'/'\&'}
   printf '%s' "$s"
}

# $4 > 0 limits the number of pages; 0, missing or non-numeric means all
# pages counted by the TIFF check. 10# forces base 10 (no octal).
if [[ "$4" =~ ^[0-9]+$ ]] && (( 10#$4 > 0 )); then
   finoa=$((10#$4))
else
   finoa=$ntiff
fi

# Values that do not change from page to page are prepared once.
page_template=/home/giancarlo/clineFC/import/templatePage.xml
book_pid_esc=$(sed_xml_escape "$bookPID")
title_esc=$(sed_xml_escape "${DCV[0]}")
# OCR files of at most this many bytes are replaced by blank.txt.
filenull=4
# DC substitutions without DCTITLE (the page gets its own title).
dc_sed=()
for i in {1..8}; do
   # Escape the brackets so the placeholder is matched literally.
   var=${DCN[$i]//"["/"\["}
   var=${var//"]"/"\]"}
   dc_sed+=( -e "s/$var/$(sed_xml_escape "${DCV[$i]}")/g" )
done

for (( npage=1; npage<=finoa; npage++ )); do

   #for every page

   printf -v snpage '%04d' "$npage"
   pagePID="$book_pid_esc-$snpage"
   pagePIDlabel="$title_esc - page $snpage"

   # Source TIFF: first regular file whose name ends in NNNN.tif
   filetiff=""
   for nfile in "$imagedir"*"$snpage".tif; do
      if [[ -f "$nfile" && ! -L "$nfile" ]]; then
         filetiff=$nfile
         break
      fi
   done
   if [[ -z "$filetiff" || ! -s "$filetiff" ]]; then
      page_error "$snpage -> ERROR file ${filetiff:-${imagedir}*$snpage.tif}"
   fi

   filetn="/srv/temp/page-tn.jpg"
   filejp2="/srv/temp/jp2.jp2"
   # Drop the previous page's work files so a failed conversion cannot
   # pass the checks below with stale content.
   rm -f "/srv/temp/tiff.tmp" "/srv/temp/tiff.tif" "$filetn" "$filejp2" \
      "/srv/temp/ocr.tmp" "/srv/temp/ocr.ocr"

   #uncompress
   tiffcp -c none "$filetiff" "/srv/temp/tiff.tmp"
   #to gray scale
   convert "/srv/temp/tiff.tmp" -colorspace Gray "/srv/temp/tiff.tif"
   rm -f "/srv/temp/tiff.tmp"
   filetiff="/srv/temp/tiff.tif"

   #page thumbnail
   risconvert=$(convert "${filetiff}[0]" -thumbnail 200x225 -fuzz 1% +repage -gravity center -format jpg -quality 100  "$filetn")
   if [[ ! -f "$filetn" || ! -s "$filetn" ]]; then
      page_error "$snpage -> ERROR file $filetn : $risconvert"
   fi

   #page jpeg2k
   riscompres=$(kdu_compress -i "$filetiff" -o "$filejp2" -rate 0.5 Clayers=1 Clevels=7 "Cprecincts={256,256},{256,256},{256,256},{128,128},{128,128},{64,64},{64,64},{32,32},{16,16}" "Corder=RPCL" "ORGgen_plt=yes" "ORGtparts=R" "Cblk={32,32}" Cuse_sop=yes)
   if [[ ! -f "$filejp2" || ! -s "$filejp2" ]]; then
      page_error "$snpage -> ERROR file $filejp2 : $riscompres"
   fi

   #OCR file
   # First regular file of the OCR directory whose name ends in NNNN.txt
   fileocr=""
   for nfile in "$ocrdir"/*"$snpage".txt; do
      if [[ -f "$nfile" && ! -L "$nfile" ]]; then
         fileocr=$nfile
         break
      fi
   done
   fileocrtmp="/srv/temp/ocr.tmp"
   if [[ -n "$fileocr" ]]; then
      #cut FF &#12 oct 014 char
      tr -d '\014' < "$fileocr" > "$fileocrtmp"
      #from dos to unix
      fromdos "$fileocrtmp"
   fi
   if [[ ! -f "$fileocrtmp" ]]; then
      page_error "$snpage -> ERROR file $fileocrtmp"
   fi
   #check zero length
   # blank.txt is looked up in the current working directory.
   if [[ ! -s "$fileocrtmp" ]] || (( $(stat -c %s "$fileocrtmp") <= filenull )); then
      cp blank.txt "$fileocrtmp"
      echo "blank file OCR"
   fi
   #check utf-8 or us-ascii
   filetype=$(file -bi "$fileocrtmp")
   if [[ "$filetype" != *utf-8 && "$filetype" != *us-ascii ]]; then
      page_error "$snpage -> ERROR file $fileocrtmp"
   fi
   #cut 3 special chars from begin
   awk '{if(NR==1)sub(/^\xef\xbb\xbf/,"");print}' "/srv/temp/ocr.tmp" > "/srv/temp/ocr.ocr"
   rm -f "/srv/temp/ocr.tmp"
   fileocrtmp="/srv/temp/ocr.ocr"

   #prepare template
   # One sed run applies every substitution, in this order: PID/labels,
   # the DC values (without DCTITLE), the file paths.
   cp /home/giancarlo/clineFC/import/templatePageMaster.xml "$page_template"
   sed -i \
      -e "s/\[PID\]/$pagePID/g" \
      -e "s/\[PIDLABEL\]/$pagePIDlabel/g" \
      -e "s/\[DCTITLE\]/$pagePIDlabel/g" \
      -e "s/\[PIDBOOK\]/$book_pid_esc/g" \
      "${dc_sed[@]}" \
      -e "s/\[FILETN\]/$(sed_xml_escape "$filetn")/g" \
      -e "s/\[FILEJP2\]/$(sed_xml_escape "$filejp2")/g" \
      -e "s/\[FILEOCR\]/$(sed_xml_escape "$fileocrtmp")/g" \
      -e "s/\[FILETIFF\]/$(sed_xml_escape "$filetiff")/g" \
      "$page_template"

   if [[ "$ingest_mode" != test ]]; then

      #ingest page
      # The password is quoted so it reaches the client as one argument.
      risultato=$(/usr/local/fedora/client/bin/fedora-ingest.sh f "$page_template" info:fedora/fedora-system:FOXML-1.1 fc1.to.cnr.it:8080 fedoraAdmin "$ingest_mode" http)
      if [[ "$risultato" == Ingested* ]]; then
         echo "$snpage -> $risultato"
      else
         echo "$snpage -> ERROR INGESTING $risultato"
         exit 1
      fi
   else

      #or test
      echo "$snpage -> INGESTED(test)"
   fi
done
 
 
ingesting/ingbscript.txt · Last modified: 2012/11/27 11:02 by giancarlo

Developers: CNR IRCrES IT Office and Library
Giancarlo Birello (giancarlo.birello _@_ ircres.cnr.it) and Anna Perin (anna.perin _@_ ircres.cnr.it)
DigiBess is licensed under: Creative Commons License
Recent changes RSS feed Creative Commons License Valid XHTML 1.0 Valid CSS Driven by DokuWiki
Drupal Garland Theme for Dokuwiki