#!/bin/bash
#
# Ingest one digitised book and its pages into a Fedora Commons repository.
#
# Usage: SCRIPT BOOK_DIR COLLECTION PASSWORD|test [NUM_PAGES]
#
#   BOOK_DIR    book directory, e.g.
#               /srv/storage/scansioni/archivio/fgramsci_TO024-00001
#               Its base name, with the first "_" replaced by ":", is the
#               book PID.
#   COLLECTION  collection identifier, e.g. openbess:cognetti-C001
#   PASSWORD    fedoraAdmin password, or the literal word "test": in test
#               mode nothing is ingested, there is no pause after the book,
#               and per-file errors are reported without aborting.
#   NUM_PAGES   number of pages to ingest; 0 or omitted means all pages.
#
# Layout of BOOK_DIR enforced by the checks below:
#   *NNNN.tif   page images, numbered consecutively from 0001
#   <dir>/      exactly one directory, holding *NNNN.txt OCR files,
#               as many as there are TIFF files
#   *.pdf       exactly one PDF
#   *.txt       exactly one file with Dublin Core lines ("NAME:value") and
#               index lines ("NNNN:label")
#
# Messages (including errors) are written to stdout, as they always were.
# Exit status: 0 on success, 1 on any error, 2 on wrong usage.

FEDORA_HOME="/usr/local/fedora"
export FEDORA_HOME
PATH=$PATH:$FEDORA_HOME/server/bin:$FEDORA_HOME/client/bin:$JAVA_HOME/bin:/bin:/usr/bin:/sbin:/usr/sbin
export PATH

readonly TEMP_DIR="/srv/temp"
readonly TEMPLATE_DIR="/home/giancarlo/clineFC/import"
readonly FEDORA_INGEST="/usr/local/fedora/client/bin/fedora-ingest.sh"
readonly FEDORA_FORMAT="info:fedora/fedora-system:FOXML-1.1"
readonly FEDORA_HOST="fc1.to.cnr.it:8080"
readonly FEDORA_USER="fedoraAdmin"
# OCR files of this many bytes or fewer are replaced by blank.txt.
readonly OCR_MIN_BYTES=4

#### helpers ####
#################

# Print a message and abort with a failure status.
die() {
  printf '%s\n' "$*"
  exit 1
}

# Print a message; abort unless running in test mode.
fail_or_warn() {
  printf '%s\n' "$*"
  if ((!TEST_MODE)); then
    exit 1
  fi
}

# find_into_found TYPE PATH...
# Store the output of "find PATH... -type TYPE" in the global array FOUND.
# NUL-delimited, so names containing blanks or newlines stay intact.
find_into_found() {
  local type=$1 path
  shift
  FOUND=()
  while IFS= read -r -d '' path; do
    FOUND+=("$path")
  done < <(find "$@" -type "$type" -print0)
}

# True when $1 is a regular file with a size greater than zero.
is_nonempty_file() {
  [[ -f "$1" && -s "$1" ]]
}

# Copy $1 to $2, dropping a UTF-8 byte order mark from the first line.
strip_bom() {
  awk '{if(NR==1)sub(/^\xef\xbb\xbf/,"");print}' "$1" > "$2"
}

# Print $1 escaped for the replacement part of a sed "s/.../.../" command.
# Replacements are quoted so that "&" is literal in every bash version.
sed_escape() {
  local text=$1 bs='\'
  text=${text//"$bs"/"$bs$bs"}
  text=${text//"/"/"$bs/"}
  text=${text//"&"/"$bs&"}
  printf '%s' "$text"
}

# render_template MASTER OUTPUT NAME VALUE [NAME VALUE]...
# Copy MASTER to OUTPUT and replace every "[NAME]" placeholder by VALUE,
# in the order given.
render_template() {
  local master=$1 output=$2
  local -a sed_args=()
  shift 2
  while (($# >= 2)); do
    sed_args+=(-e "s/\[$1\]/$(sed_escape "$2")/g")
    shift 2
  done
  cp "$master" "$output" && sed -i "${sed_args[@]}" "$output"
}

# make_thumbnail SOURCE TARGET
# Write a 200x225 JPEG thumbnail of the first image in SOURCE to TARGET;
# the converter's output is left in the global risconvert.
make_thumbnail() {
  risconvert=$(convert "$1[0]" -thumbnail 200x225 -fuzz 1% +repage -gravity center -format jpg -quality 100 "$2")
}

# ingest_foxml FILE
# Ingest one FOXML file; the client's output goes to stdout.
# NOTE: the client takes the password as an argument, so it is visible in
# the process list while the ingest runs.
ingest_foxml() {
  "$FEDORA_INGEST" f "$1" "$FEDORA_FORMAT" "$FEDORA_HOST" "$FEDORA_USER" "$FEDORA_PASSWORD" http
}

#### parameters ####
####################

if (($# < 3)); then
  printf 'Usage: %s BOOK_DIR COLLECTION PASSWORD|test [NUM_PAGES]\n' "${0##*/}"
  exit 2
fi

#1 book directory (a trailing slash would otherwise give an empty PID)
IMPORT_BASE_DIR=${1%/}
#2 collection
COLLEZIONE=$2
#3 fedoraAdmin password or test
FEDORA_PASSWORD=$3
if [[ "$FEDORA_PASSWORD" == test ]]; then
  TEST_MODE=1
else
  TEST_MODE=0
fi
#4 number of pages to ingest or 0 for all
num_pages=${4:-0}
if [[ ! "$num_pages" =~ ^[0-9]+$ ]]; then
  die "ERROR numero pagine non valido: $num_pages"
fi
# Force base 10 so that a value such as "08" is not read as octal.
num_pages=$((10#$num_pages))

IMPORT_VOL_DIR=${IMPORT_BASE_DIR##*/}

#### pre-ingesting check ####
#############################
bookPID=${IMPORT_VOL_DIR/"_"/":"}
imagedir="$IMPORT_BASE_DIR/"
echo "============================================================================="
echo "Book PID: $bookPID"
echo "dir: $imagedir"
echo "-----------------------------------------------------------------------------"

#check files TIFF: the n-th file must end in the 4-digit number n
find_into_found f "$imagedir"*.tif
ntiff=0
for nfile in "${FOUND[@]}"; do
  ntiff=$((ntiff + 1))
  printf -v sn '%04d' "$ntiff"
  if [[ "$nfile" != *"$sn".tif ]]; then
    die "ERROR file $nfile non corrisponde a $sn"
  fi
done

#check only one OCR directory
find_into_found d "$imagedir"*
if ((${#FOUND[@]} != 1)); then
  die "ERROR dir TXT non unica"
fi
ocrdir=${FOUND[0]}

#check files TXT: same numbering rule as the TIFF files
find_into_found f "$ocrdir"/*.txt
ntxt=0
for nfile in "${FOUND[@]}"; do
  ntxt=$((ntxt + 1))
  printf -v st '%04d' "$ntxt"
  if [[ "$nfile" != *"$st".txt ]]; then
    die "ERROR file $nfile non corrisponde a $st"
  fi
done

#check same number of files TIFF and TXT
if ((ntiff != ntxt)); then
  die "ERROR files $ntiff TIFF non corrisponde a $ntxt TXT"
fi

#check PDF
find_into_found f "$imagedir"*.pdf
if ((${#FOUND[@]} != 1)); then
  die "ERROR file PDF non unico"
fi
cp "${FOUND[0]}" "$TEMP_DIR/pdf.pdf" || die "ERROR file $TEMP_DIR/pdf.pdf"
filepdf="$TEMP_DIR/pdf.pdf"

#check DC and index
find_into_found f "$imagedir"*.txt
if ((${#FOUND[@]} != 1)); then
  die "ERROR file DC e Indice non unico"
fi

#get DC and index
filedcindice="$TEMP_DIR/fdci.dci"
cp "${FOUND[0]}" "$filedcindice"
chmod +w "$filedcindice"
#from dos to unix
fromdos "$filedcindice"
if ! is_nonempty_file "$filedcindice"; then
  fail_or_warn "ERROR file $filedcindice"
fi

#check utf-8 or us-ascii
filetype=$(file -bi "$filedcindice")
if [[ "$filetype" != *utf-8 && "$filetype" != *us-ascii ]]; then
  fail_or_warn "ERROR file $filedcindice $filetype"
fi

#cut 3 special chars (UTF-8 byte order mark) from begin
strip_bom "$filedcindice" "$TEMP_DIR/fdci.idx"
rm "$filedcindice"
filedcindice="$TEMP_DIR/fdci.idx"

# Dublin Core field names. DCV holds the values at the same positions, and
# "[DC<name>]" is the placeholder used in the templates.
DC_NAMES=(TITLE CREATOR SUBJECT DESCRIPTION PUBLISHER DATE TYPE FORMAT LANGUAGE)
DCV=("" "" "" "" "" "" "" "" "")

# First pass: metadata lines, i.e. non-empty lines not starting with a
# digit. "read" trims surrounding blanks; -r keeps backslashes, and the
# "||" part keeps a last line that has no trailing newline.
while read -r line || [[ -n "$line" ]]; do
  if [[ -z "$line" || "${line:0:1}" == [0-9] ]]; then
    continue
  fi
  dcvar=${line%%:*}
  dcval=${line#*:}
  case "$dcvar" in
    TITLE) DCV[0]=$dcval ;;
    CREATOR) DCV[1]=$dcval ;;
    SUBJECT) DCV[2]=$dcval ;;
    DESCRIPTION) DCV[3]=$dcval ;;
    PUBLISHER) DCV[4]=$dcval ;;
    DATE)
      # accepted forms: YYYY-MM-DD, YYYY-MM, YYYY
      case "$dcval" in
        [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] | \
          [0-9][0-9][0-9][0-9]-[0-9][0-9] | \
          [0-9][0-9][0-9][0-9])
          DCV[5]=$dcval
          ;;
        *)
          die "ERRORE Formato data: $dcval"
          ;;
      esac
      ;;
    TYPE) DCV[6]=$dcval ;;
    FORMAT) DCV[7]=$dcval ;;
    LANGUAGE) DCV[8]=$dcval ;;
    *)
      die "ERRORE metadati DC: $dcvar val: $dcval"
      ;;
  esac
done < "$filedcindice"

#check DC TITLE
if [[ -z "${DCV[0]}" ]]; then
  die "ERRORE Manca DC TITLE"
fi
# Content of the INDEX datastream: "PID|label" entries joined by "||".
linkind="$bookPID|${DCV[0]}"

#display DC
for i in "${!DC_NAMES[@]}"; do
  echo "[DC${DC_NAMES[$i]}] = ${DCV[$i]}"
done

#check index
# Second pass: index lines, i.e. lines starting with a digit; the part
# before the first ":" must be a 4-digit page number.
while read -r line || [[ -n "$line" ]]; do
  if [[ -z "$line" || "${line:0:1}" != [0-9] ]]; then
    continue
  fi
  dcvar=${line%%:*}
  dcval=${line#*:}
  case "$dcvar" in
    [0-9][0-9][0-9][0-9])
      echo "$dcval pag.$dcvar"
      linkind="$linkind||$bookPID-$dcvar|$dcval"
      ;;
    *)
      die "ERRORE indice: $dcval pag. $dcvar"
      ;;
  esac
done < "$filedcindice"
rm "$filedcindice"

#### book ingesting ####
########################

#create datastream INDEX file
fileindex="$TEMP_DIR/idx.idx"
rm -f -- "$fileindex"
printf '%s\n' "$linkind" > "$fileindex"

#create book thumbnail image from page 0001
find_into_found f "$imagedir"*0001.tif
filetiff=${FOUND[0]}
# Quoted test: an empty result must count as "missing".
if [[ ! -e "$filetiff" ]]; then
  die "ERRORE file TIFF 0001 per TN $filetiff"
fi
filetn="$TEMP_DIR/tnbook.jpg"
# Remove leftovers so that a failed conversion cannot pass the checks
# below on a file produced by an earlier run.
rm -f -- "$TEMP_DIR/tiff.tmp" "$TEMP_DIR/tiff.tif" "$filetn"
#uncompress (NOT converted to gray scale for the book thumbnail)
tiffcp -c none "$filetiff" "$TEMP_DIR/tiff.tif"
#create thumbnail
make_thumbnail "$TEMP_DIR/tiff.tif" "$filetn"
if ! is_nonempty_file "$filetn"; then
  die "ERROR file $filetn : $risconvert"
fi
#convert to RGB (IE8 compatibility)
risconvert=$(convert "$filetn" -colorspace RGB "$filetn")
if ! is_nonempty_file "$filetn"; then
  die "ERROR file $filetn : $risconvert"
fi

#prepare template
# Placeholder/value pairs for the nine DC fields, title first.
dc_pairs=()
for i in "${!DC_NAMES[@]}"; do
  dc_pairs+=("DC${DC_NAMES[$i]}" "${DCV[$i]}")
done
render_template "$TEMPLATE_DIR/templateBookMaster.xml" "$TEMPLATE_DIR/templateBook.xml" \
  PID "$bookPID" PIDLABEL "${DCV[0]}" COLLEZIONE "$COLLEZIONE" \
  "${dc_pairs[@]}" \
  FILETN "$filetn" FILEINDEX "$fileindex" FILEPDF "$filepdf" \
  || die "ERROR file $TEMPLATE_DIR/templateBook.xml"

if ((!TEST_MODE)); then
  #ingest book
  risultato=$(ingest_foxml "$TEMPLATE_DIR/templateBook.xml")
  if [[ "$risultato" == Ingested* ]]; then
    echo "$risultato"
  else
    die "ERROR INGESTING $risultato"
  fi
  # fixed pause after the book ingest, before the pages are sent
  sleep 120
else
  #or test
  echo "-> INGESTED(test)"
  echo "NO wait"
fi

#### pages ingesting ####
#########################
if ((num_pages > 0)); then
  finoa=$num_pages
else
  finoa=$ntiff
fi

page_tn="$TEMP_DIR/page-tn.jpg"
filejp2="$TEMP_DIR/jp2.jp2"

for ((npage = 1; npage <= finoa; npage++)); do
  #for every page
  printf -v snpage '%04d' "$npage"
  pagePID="$bookPID-$snpage"
  pagePIDlabel="${DCV[0]} - page $snpage"

  # Remove the previous page's outputs: otherwise a failed conversion
  # would pass the checks below on the previous page's files.
  rm -f -- "$TEMP_DIR/tiff.tmp" "$TEMP_DIR/tiff.tif" "$page_tn" "$filejp2" \
    "$TEMP_DIR/ocr.tmp"

  find_into_found f "$imagedir"*"$snpage".tif
  filetiff=${FOUND[0]}
  if ! is_nonempty_file "$filetiff"; then
    fail_or_warn "$snpage -> ERROR file $filetiff"
  fi
  #uncompress
  tiffcp -c none "$filetiff" "$TEMP_DIR/tiff.tmp"
  #to gray scale
  convert "$TEMP_DIR/tiff.tmp" -colorspace Gray "$TEMP_DIR/tiff.tif"
  rm -f -- "$TEMP_DIR/tiff.tmp"
  filetiff="$TEMP_DIR/tiff.tif"

  #page thumbnail
  make_thumbnail "$filetiff" "$page_tn"
  if ! is_nonempty_file "$page_tn"; then
    fail_or_warn "$snpage -> ERROR file $page_tn : $risconvert"
  fi

  #page jpeg2k
  riscompres=$(kdu_compress -i "$filetiff" -o "$filejp2" -rate 0.5 Clayers=1 Clevels=7 "Cprecincts={256,256},{256,256},{256,256},{128,128},{128,128},{64,64},{64,64},{32,32},{16,16}" "Corder=RPCL" "ORGgen_plt=yes" "ORGtparts=R" "Cblk={32,32}" Cuse_sop=yes)
  if ! is_nonempty_file "$filejp2"; then
    fail_or_warn "$snpage -> ERROR file $filejp2 : $riscompres"
  fi

  #OCR file
  find_into_found f "$ocrdir"/*"$snpage".txt
  fileocr=${FOUND[0]}
  fileocrtmp="$TEMP_DIR/ocr.tmp"
  #cut FF oct 014 char
  tr -d '\014' < "$fileocr" > "$fileocrtmp"
  #from dos to unix
  fromdos "$fileocrtmp"
  if [[ ! -f "$fileocrtmp" ]]; then
    fail_or_warn "$snpage -> ERROR file $fileocrtmp"
  fi
  #check zero length (or nearly empty)
  filel=$(stat -c %s "$fileocrtmp")
  if [[ ! -s "$fileocrtmp" ]] || ((filel <= OCR_MIN_BYTES)); then
    # NOTE(review): blank.txt is looked up in the current working
    # directory - confirm where the script is expected to be run from.
    cp blank.txt "$fileocrtmp"
    echo "blank file OCR"
  fi
  #check utf-8 or us-ascii
  filetype=$(file -bi "$fileocrtmp")
  if [[ "$filetype" != *utf-8 && "$filetype" != *us-ascii ]]; then
    fail_or_warn "$snpage -> ERROR file $fileocrtmp"
  fi
  strip_bom "$fileocrtmp" "$TEMP_DIR/ocr.ocr"
  rm -f -- "$fileocrtmp"
  fileocrtmp="$TEMP_DIR/ocr.ocr"

  #prepare template
  # The page label takes the place of DCTITLE, so the book title pair
  # (the first two elements of dc_pairs) is skipped.
  render_template "$TEMPLATE_DIR/templatePageMaster.xml" "$TEMPLATE_DIR/templatePage.xml" \
    PID "$pagePID" PIDLABEL "$pagePIDlabel" \
    DCTITLE "$pagePIDlabel" PIDBOOK "$bookPID" \
    "${dc_pairs[@]:2}" \
    FILETN "$page_tn" FILEJP2 "$filejp2" FILEOCR "$fileocrtmp" \
    FILETIFF "$filetiff" \
    || die "$snpage -> ERROR file $TEMPLATE_DIR/templatePage.xml"

  if ((!TEST_MODE)); then
    #ingest page
    risultato=$(ingest_foxml "$TEMPLATE_DIR/templatePage.xml")
    if [[ "$risultato" == Ingested* ]]; then
      echo "$snpage -> $risultato"
    else
      die "$snpage -> ERROR INGESTING $risultato"
    fi
  else
    #or test
    echo "$snpage -> INGESTED(test)"
  fi
done