#!/usr/bin/perl

#     One code to find them all  -- perl utility to extract information from RepeatMasker output files
#     Copyright (C) 2014  Bailly-Bechet Marc
# 
#     This program is free software; you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation; either version 3 of the License, or
#     (at your option) any later version.
# 
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
# 
#     You should have received a copy of the GNU General Public License
#     along with this program. If not, see <http://www.gnu.org/licenses/>.


### This script sums all *copynumber* files
### from a previous run of one_code_to_find_them_all.pl

############### SCRIPT USAGE #####################
### To use, simply run:
###
### ./sum_copynumber.pl --dir directory_name
### 
### where  directory_name is the name of a directory
### containing multiple *copynumber.csv files
### (the script is recursive)
### The sum of all these files will be printed in standard output
###
### To redirect this output to a file named filename, run:
###
### ./sum_copynumber.pl --dir directory_name > filename
###
###################################################



use FileHandle;
use Getopt::Long;
use File::Find;
use File::Basename;

# Parse the command line.  GetOptions returns false when the options could
# not be parsed (unknown option, missing value, ...); stop in that case
# instead of carrying on with a half-understood command line.
our $dir;
GetOptions('dir=s' => \$dir)
  or die("Dying! Usage: $0 --dir directory_name\n");

# --dir is mandatory and must name an existing directory.
unless(defined $dir && -d $dir){die("Dying! Option --dir must be given a directory argument\n")}

# Accumulators filled by Wanted_Copynumber and read back when printing:
#   $mem{class}{family}{"col2||col3"}[0..3]  sums for individual element rows
#   $smem{summary row label}[0..3]           sums for rows starting with "#"
our %mem=();
our %smem=();

# Walk $dir recursively; Wanted_Copynumber is called once per entry found.
find(\&Wanted_Copynumber, $dir);

# File::Find callback: fold one *.copynumber.csv file into %mem and %smem.
#
# Called by find() with the current file name in $_; names that do not end
# in ".copynumber.csv" are ignored.  Lines starting with "Family" (the
# header) are skipped.  Every other line is split on whitespace and its
# columns 4 to 7 are summed, "NA" counting as 0:
#   - rows whose first column starts with "#" are summary rows and go to
#     $smem{first column}[0..3];
#   - rows whose first column starts with DNA, LINE, SINE or LTR go to
#     $mem{class}{first column}{"col2||col3"}[0..3];
#   - any other row (blank lines included) is skipped, so that it can never
#     be counted under the class of whatever row happened to precede it.
#
# Dies if a matching file cannot be opened.
sub Wanted_Copynumber{
  our (%mem, %smem);

  # Work on a private copy of the name and read lines into a lexical, so
  # the $_ that File::Find set for us is left untouched.
  my $file = $_;
  return unless $file =~ /.*\.copynumber\.csv$/;

  warn "Found copynumber file $file\n";
  open(my $copy_fh, '<', $file) or die("Cannot open file $file; dying ($!)");

  while (my $line = <$copy_fh>) {
    next if $line =~ /^Family/;
    chomp $line;
    my @fields = split ' ', $line;
    next unless @fields;

    my $family = $fields[0];

    # Pick the array of four running totals this row contributes to.
    my $totals;
    if ($family =~ /^\#/) {
      $totals = ($smem{$family} ||= []);
    }
    elsif ($family =~ /^(DNA|LINE|SINE|LTR)/) {
      my $class = $1;
      my $name = $fields[1]."||".$fields[2];
      $totals = ($mem{$class}{$family}{$name} ||= []);
    }
    else {
      next;
    }

    for my $col (3 .. 6) {
      my $value = $fields[$col];
      # Missing columns and "NA" both count as zero.
      $value = 0 if !defined $value || $value eq "NA";
      $totals->[$col - 3] += $value;
    }
  }

  close $copy_fh;
}

# Format one summary row: the label, the fixed "All_elements" and "NA"
# columns, then the four totals.  $totals is an array reference (or undef
# when no input file contained that summary row); missing totals are
# reported as 0, the same way "NA" is counted while summing.
sub _format_total_row {
  my ($label, $totals) = @_;
  my @source = $totals ? @{$totals} : ();
  my @counts = map { defined $source[$_] ? $source[$_] : 0 } 0 .. 3;
  return join("\t", $label, "All_elements", "NA", @counts)."\n";
}

# Report, class by class: every element row of a family, then that family's
# "###" summary row, and finally the class "######Type:" summary row.
# Families and elements are sorted so that two runs over the same input
# give byte-identical output (plain hash order changes from run to run).
foreach my $class ("DNA","LINE","SINE","LTR"){
  my $families = $mem{$class} || {};
  foreach my $family (sort keys %{$families}){
    my $elements = $families->{$family};
    foreach my $elem (sort keys %{$elements}){
      # Keys were stored as "col2||col3"; split them back into two columns.
      my @name_parts = split /\|\|/,$elem;
      print "$family\t$name_parts[0]\t$name_parts[1]";
      foreach my $idx (0 .. 3){
        print "\t$elements->{$elem}[$idx]";
      }
      print "\n";
    }
    print _format_total_row("###".$family, $smem{"###".$family});
  }
  print _format_total_row("######Type:".$class, $smem{"######Type:".$class});
}

# Summary rows that have no element-level detail in this report.
foreach my $other ("#########Type:EVERYTHING_TE", "######Type:Low_complexity", "######Type:Satellite", "######Type:Simple_repeat", "######Type:Unknown"){
  print _format_total_row($other, $smem{$other});
}



