#!/bin/bash

# ----------------------------------------------------------------------
# Builds the shell file search_duplicates_wd.bash, which searches for
# and lists duplicate files, then runs it.
# 
# Files are grouped by size, in bytes, and cmp is run on
# the files that have the same size.
# Uses the shell compare_file_size.bash
#
# A report, search_duplicates_wd.txt, is created.
#
# Patrick ROBERT, Novembre 2010
# ----------------------------------------------------------------------

# Script name with its directory stripped, plus a copy padded/truncated to
# exactly 36 columns so the status lines written to stderr stay aligned.
# Everything is quoted so a script path containing blanks is handled.
appli=${0##*/}
appli36=$(printf '%-36.36s' "$appli")

# Number of positional arguments this command accepts.
Narg=0

# hh=1 when the only argument is -h (help requested).  The expansions are
# quoted so an empty or multi-word argument cannot break the comparison.
hh=0
if [ "$#" -eq 1 ] && [ "$1" = "-h" ]; then hh=1; fi

# Print the usage text on -h or on a wrong argument count.  Help exits with
# status 0; a wrong argument count reports an error on stderr and exits 1.
if [ "$#" -ne "$Narg" ] || [ "$hh" -eq 1 ]
   then
   echo "$appli : search and list duplicate files in a tree"
   echo "$appli   require $Narg argument(s), ex:"
   echo "$appli   "
   echo ""
   if [ "$hh" -eq 1 ]; then exit 0; fi
   echo " $appli36 : *** ERROR ! Command aborted." >&2
   exit 1
fi

# Record the start of the run: a readable time stamp and the epoch seconds
# (the latter is what the duration printed at the end is computed from).
datim1=$(date '+%F  %H:%M:%S')
julsec1=$(date +%s)

# Opening banner with the directory the search starts from.
printf '%s\n' \
   " " \
   " -------------------------------------------------------------------" \
   "run of search_duplicates from current directory :" \
   "$(pwd)" \
   " " \
   "Computation of size catalog, please wait...."

# Thirteen scratch files, toto0.tmp .. toto12.tmp, are used in the current
# directory.
tmp_ids="0 1 2 3 4 5 6 7 8 9 10 11 12"

# Delete whichever scratch files exist; missing ones are ignored.
remove_tmp_files() {
   local id
   for id in $tmp_ids
   do
      rm -f "toto$id.tmp"
   done
}

# Start from a clean state, and make sure the scratch files are also removed
# when the script stops early (an exit in a later step, an interruption).
remove_tmp_files
trap remove_tmp_files EXIT

# List every regular file below the current directory.  toto0.tmp is created
# by the redirection before find starts, so it is excluded explicitly;
# otherwise the list would contain itself and the count would be off by one.
find . -type f ! -path ./toto0.tmp > toto0.tmp

# Feeding wc through a redirection keeps the file name out of its output; the
# arithmetic expansion strips the padding some wc implementations emit.
nbfiles=$(( $(wc -l < toto0.tmp) ))

echo "Number of files to compare: $nbfiles"
echo "   1. Making catalog of various sizes..."
echo "   1. Making catalog of various sizes..."

# Files named zz_fin.jpg are left out of the comparison.
#
# The remaining names are pasted into a generated command file later on, so
# each character the shell would interpret is replaced by '?', a wildcard
# that still matches the real name when that command file runs.

# Bracket expression listing the masked characters: blank, tab, [ ] ( )
# ' " ` $ & ; | < > \ { } and *.  ']' must stand first inside the brackets.
tab=$(printf '\t')
mask_class="[] ${tab}[()'\"\`\$&;|<>\\\\{}*]"

# Filter (stdin -> stdout): drop the zz_fin.jpg entries and mask EVERY
# occurrence of a special character on each line.
mask_shell_chars() {
   sed -e '/zz_fin\.jpg/d' -e "s/${mask_class}/?/g"
}

# Number of names containing a blank, a parenthesis, or any masked character.
nbblf=$(grep -c ' ' toto0.tmp)
nbpaf=$(grep -c '(' toto0.tmp)
nbmask=$(grep -c "$mask_class" toto0.tmp)

# Tell the user whenever at least one name had to be altered.
if [ "${nbmask:-0}" -ne 0 ]
   then
   echo "      * there is $nbblf files with names contains a white space"
   echo "      * there is $nbpaf files with names contains a parenthesis"
   echo "        (could be the same)"
   echo "      * there is $nbmask files with names contains a shell special character"
   echo "      correction with '?' character done"
   echo " "
fi


mask_shell_chars < toto0.tmp > toto1.tmp
# Build a small command file holding one "wc -c <name>" line per file, so the
# byte size of every file is obtained by running it.
echo '#!/bin/bash' > toto2.tmp
sed "s/^/wc -c /" toto1.tmp >> toto2.tmp

# Filter (stdin -> stdout): turn raw "wc -c" output into unique
# "SIZE ./path" lines.
#  - left padding is removed so the size is always the first field;
#  - only lines whose name starts with "./" are kept, which drops the summary
#    line wc adds when a '?' pattern matches several files;
#  - sort -u removes the repeated entries produced in that same case.
clean_size_list() {
   sed -n -e 's/^ *//' -e '/^[0-9][0-9]* \.\//p' | sort -u
}

# Filter (stdin -> stdout): from a list of sizes, one per line, print once
# each non-zero size that occurs more than once.  Sorting here makes the
# result independent of the order of the input.
duplicated_sizes() {
   sort | uniq -d | sed '/^0$/d'
}

# Sizes with their paths, sorted.
sh toto2.tmp | clean_size_list > toto3.tmp

# Size column only, one line per file (repeated sizes included).
sed "s/ .*$//" toto3.tmp > toto4.tmp

# Size column without repetitions.  No longer needed to find the shared
# sizes, but still written because the final clean-up removes every scratch
# file unconditionally.
sort -u toto4.tmp > toto5.tmp

# Sizes shared by at least two files: only those files can be identical.
# Size 0 is left out.
duplicated_sizes < toto4.tmp > toto6.tmp

# Count the sizes listed in toto6.tmp.
nbsizes=$(wc -l toto6.tmp | cut -d ' ' -f 1)

printf '%s\n' "      Number of different sizes : $nbsizes"

# No size listed means there is nothing to compare: say so and stop here.
if [ $nbsizes -eq 0 ]; then
   echo "--> no duplicate files, all sizes are different"
   exit
fi

# Build the command file that searches for duplicates.

printf '%s\n' "   2. making shell for searching duplicate files...."

# toto7.tmp: a running line number for each size, laid out the way "cat -n"
# prints it (right-aligned in 6 columns, followed by a tab).  It is added so
# that the progress of the run is known.
awk '{ printf "%6d\t\n", NR }' toto6.tmp > toto7.tmp

# toto8.tmp: size and counter glued together (tabs removed), followed by
# "/<number of sizes> " and the first script argument.
paste toto6.tmp toto7.tmp \
   | tr -d '\t' \
   | sed -e "s/\$/\/$nbsizes $1/" > toto8.tmp

# Start from a fresh command file.
if [ -f search_duplicates_wd.bash ]; then
   rm search_duplicates_wd.bash
fi

# One "compare_file_size.bash <size> <i>/<total>" line per size.
sed -e 's/^/compare_file_size.bash /' toto8.tmp > search_duplicates_wd.bash

# Run search_duplicates_wd.bash and write the opening and closing parts of
# the report.

rule="==============================================="
here=$(pwd)

# Report header.
{
   echo "$rule"
   echo "$here"
   echo "Nb. of files to compare : $nbfiles"
   echo "$rule"
} > ./search_duplicates_wd1.txt

echo "   3. Run shell in batch mode :"
echo " "

# Raw output of the comparisons is collected in toto9.tmp.
sh ./search_duplicates_wd.bash >> ./toto9.tmp

# Report footer.
{
   echo "$rule"
   echo "$here"
   echo "Nb. of files compared : $nbfiles"
   echo "$rule"
} > ./search_duplicates_wd3.txt

echo
echo -----------------------------------------------

# Post-process the raw comparison output held in toto9.tmp:
#   toto10.tmp : lines starting with "Compare" removed
#   toto11.tmp : lines starting with "--" and empty lines removed as well
#   toto12.tmp : remaining lines joined two by two, then sorted
sed -e '/^Compare/d' ./toto9.tmp > toto10.tmp
sed -e '/^--/d' -e '/^$/d' toto10.tmp > toto11.tmp
sed -e 'N' -e 's/\n/ /' toto11.tmp | sort > toto12.tmp

# Count the joined lines.
nbli=$(wc -l toto12.tmp | awk '{ print $1 }')

# Middle part of the report: either a "nothing found" notice or the list.
case $nbli in
   0)
      echo "*** No duplicate files ***" > search_duplicates_wd2.txt
      ;;
   *)
      {
         echo "$nbli comparisons to do :"
         echo " "
         cat toto12.tmp
      } > search_duplicates_wd2.txt
      ;;
esac

# Final report = header + middle part + footer.
cat search_duplicates_wd1.txt \
    search_duplicates_wd2.txt \
    search_duplicates_wd3.txt > search_duplicates_wd.txt

# Clean-up: remove the scratch files toto0.tmp .. toto12.tmp ...
n=0
while [ "$n" -le 12 ]
do
   rm toto$n.tmp
   n=$((n + 1))
done

# ... then the three report parts and the generated command file.
rm search_duplicates_wd1.txt search_duplicates_wd2.txt search_duplicates_wd3.txt \
   ./search_duplicates_wd.bash

# Disabled: conversion of the report to DOS line endings.
# unix2dos ./search_duplicates_wd.txt

printf '%s\n' "results are in ./search_duplicates_wd.txt"


printf '%s\n' " " "search finished on $nbfiles files !"
echo -----------------------------------------------

# End of command: take the end time and work out the elapsed seconds from
# the epoch value recorded at start-up.
datim2=$(date '+%F  %H:%M:%S')
julsec2=$(date +%s)
diff=$(expr $julsec2 - $julsec1)

printf '%s\n' \
   " " \
   " Starting time $appli : $datim1" \
   " Ending   time $appli : $datim2" \
   " Duration      $appli : $diff sec." \
   ""

# The termination status line goes to stderr.
echo " $appli36 : NORMAL TERMINATION - time exe= $diff s." >&2

echo " -------------------------------------------------------------------"

