# 'cfoo5' program in Perl generates superclusters and html pages # 2001 J. M. Ward $num_seqs = 4752; #needs to be set to the number of sequences $name1 = "hydro.dat"; #contains selected TMS protein sequences $name2 = "famlist25"; #contains results from pairwise alignment $name3 = "atxgfam.dat"; #contains tc families for each accession $name4 = "tclinks.dat"; #contains links to tc familes $name5 = "dict.tms"; #dictionary of N-terminal sequence by family names $width = 83; #line width for printing $color = "blue"; $name6 = "seq.dat"; #new file with results for each sequence $name7 = "fam.dat"; #new file with results for each family %rec = (); # contains a record for each sequence @fam = (); # the list of close sequences %fam_name = (); # hash of family names @family_names = (); # names of families by number %sf_num = (); # hash of super family number %atxg = (); # hash of accessions by family %tclink = (); # hash of tc links @sf_by_num = (); # super families by family number $sfn = 12; # number of super families $unknownsf = 11; # sf for unknown seq open(SFILE, $name1) or die 'Could not open Fasta file'; open(FFILE, $name2) or die 'Could not open family list file'; open(DFILE, $name5) or die 'Could not open family name file'; open(TCFILE, $name3) or die 'Could not open atxg file'; open(TCLFILE, $name4) or die'Could not open tclinks file'; while (($ftemp, $ltemp) = split '\?', , 2) { # input tclinks by family chomp $ltemp; $tclink{$ftemp} = $ltemp; } close (TCLFILE); while (($atemp, $ftemp) = split ' ', , 2) { # input tc fam names by atxgxx accession chomp $ftemp; $ftemp =~ s/\s+$//; $atxg{$atemp} = $ftemp; } close (TCFILE); print "Reading sequences into %rec... \n"; init_record (SFILE); print "Generating similarity arrays... \n"; for $i (0..($num_seqs-1)) { @{$fam[$i]} = (); $t = ; chomp $t; @tmp = split(" ", $t); if (scalar (@tmp) > 1) { $t = shift @tmp; @{$fam[$i]} = @tmp; } } $fam_counter = 1; print "Clustering families... \n"; for $i (0..($num_seqs-1)) { if ($rec{$i}{family} == 0 ) { # if no family is assigned @set = (); @set = familysize ($i, \@set, \@{$fam[$i]}); # collect the families if (scalar @set) { # if there are similar sequences push (@set, $i); foreach $j (@set) { if ($rec{$j}{family} != 0) { # reset the fam number fix_fam($num_seqs-1, $fam_counter, $rec{$j}{family}); # because familysize is recursive } $rec{$j}{family} = $fam_counter; } $fam_counter++; } } } print "Clustering complete \n"; for $k (0..($num_seqs-1)) { if (!in($rec{$k}{family},@family_list)) { # if family not on list push @family_list, $rec{$k}{family}; } } print "There are ", scalar(@family_list), " families in the dataset \n"; print "Naming families...\n"; while (($nterm, $sf, $fname) = split ' ', , 3) { # input family names from dictionary chomp $fname; $fam_name{$nterm} = $fname; $sf_num{$nterm} = $sf; } close (DFILE); for $k (0..($num_seqs-1)) { if ($rec{$k}{family} != 0) { # is seq in a family foreach $key (keys (%fam_name)) { # go through the dictionary if ($rec{$k}{sequence} =~ /^$key/) { #if a dictionary key matches the sequence $family_names[$rec{$k}{family}] = $fam_name{$key}; # set the family name for the family $sf_by_num[$rec{$k}{family}] = $sf_num{$key}; # set the super family number last; } } } } $family_names[0] = "Sequences not placed in a family (singletons)"; print "Beginning to output main HTML files... \n"; for $ij (1..$sfn) { #initialize the html file open (HTFILE, ">sf".$ij."\.htm") or die 'Could not open sf file'; print HTFILE ""; print HTFILE "

Families

\n
"; close (HTFILE); } foreach $i (@family_list) { if ($i > 0) { $fn = "fam".$i; open (HTFILE2, ">$fn"."\.htm") or die 'Could not open html file'; #html file if (! -d 'fasta') { mkdir('fasta', 0755); } #if fasta folder doesn't exist, make it chdir ('fasta'); open (HTFILE4, ">$fn"."\.fas") or die 'Could not open fasta file'; #fasta file chdir('::'); print HTFILE2 " \n "; #background for file print HTFILE2 "\n"; print HTFILE2 "\n"; print HTFILE2 "

View family alignment

Fasta file of sequences

Relatives


\n"; $numseq = 0; for $j (0..($num_seqs-1)) { if ($rec{$j}{family} == $i) { $numaa = length ($rec{$j}{sequence}) - 1; #$rec{$j}{name}=~ s/(\w+)\.(.+)/$1/; print HTFILE2 " \n "; print HTFILE2 "

",$rec{$j}{name},"\t"; print HTFILE2 "","Link to TAIR","\t"; print HTFILE2 "","Link to Aramemnon","\t"; print HTFILE2 "\(",$numaa,"aa) ", $rec{$j}{tms}," TMS\, N-term is ",$rec{$j}{direct},"

"; $rec{$j}{description} =~ s/\#(.+?)\#/ \$1<\/A> /g; $rec{$j}{description} =~ s/(emb\||gb\:|sp\||gb\|)([a-zA-Z0-9\.]+)(\,|\)|\W)/$1 \$2<\/A>$3/ig; foreach $key (keys (%atxg)) { if ($rec{$j}{name} =~ /^$key/i) { $rec{$j}{description} .= "\, in $tclink{$atxg{$key}}"; } } $rec{$j}{description} =~ s/at(\w+)\.(\d)//i; print HTFILE2 "

",$rec{$j}{description}, "

\n
";
      printp($rec{$j}{sequence},*HTFILE2, $width, $color, 0, split("\,",$rec{$j}{posit})); 
      print HTFILE2 "\n
\n"; print HTFILE2 "\n \n"; print HTFILE4 ">",($numseq + 1),":",$rec{$j}{name}, "\n"; # prints fasta file print1($rec{$j}{sequence},*HTFILE4); $numseq++; } } close HTFILE2; close HTFILE4; if ($sf_by_num[$i]) { $fn2 = "sf".$sf_by_num[$i]; } elsif ($i != 0) { $fn2 = "sf$unknownsf"; } elsif($i == 0) { $fn2 = "temp"; } open (HTFILE, ">>$fn2"."\.htm") or die 'Could not open file'; print HTFILE "
", $family_names[$i], " family \(", $numseq, " members\)
\n"; if ($family_names[$i] ne "") {$family_names[$i] .= " family ($numseq members)"} close(HTFILE); } } for $ij (1..$sfn) { # new section in sf files open (HTFILE, ">>sf".$ij."\.htm") or die 'Could not open sf file'; print HTFILE "

Single Sequences

\n
"; close (HTFILE); } print "Collecting singletons...\n"; open (HTFILE2, ">fam0\.htm") or die 'Could not open file'; print HTFILE2 " \n
"; close (HTFILE2); $id = 0; for $i (0..($num_seqs-1)) { if ($rec{$i}{family} == 0) { # for each singleton foreach $key (keys (%fam_name)) { # go through the dictionary if ($rec{$i}{sequence} =~ /^$key/) { #if a dictionary key matches the sequence $fn2 = "sf".$sf_num{$key}; $rec{$i}{sf} = $sf_num{key}; $rec{$i}{family} = $fam_counter; $family_names[$rec{$i}{family}] = "Singleton: ".$fam_name{$key}; # set the name for the singleton push @family_list, $rec{$i}{family}; $fn = "sin".$fam_counter; open (HTFILE, ">>$fn2"."\.htm") or die 'Could not open file'; print HTFILE "
", $fam_name{$key}, "
\n"; close(HTFILE); open (HTFILE2, ">$fn"."\.htm") or die 'Could not open html file'; #html file print HTFILE2 " \n
"; #background for file print HTFILE2 "

Relatives


\n"; $id = 1; $fam_counter++; last; } } if (! $id) { open (HTFILE2, ">>fam0\.htm") or die 'Could not open file'; } $numaa = length ($rec{$i}{sequence}) - 1; #$rec{$i}{name}=~ s/(\w+)\.(.+)/$1/; print HTFILE2 " \n "; print HTFILE2 "

",$rec{$i}{name},"\t"; print HTFILE2 "","Link to TAIR","\t"; print HTFILE2 "","Link to Aramemnon","\t"; print HTFILE2 "\(",$numaa,"aa) ", $rec{$i}{tms}," TMS\, N-term is ",$rec{$i}{direct},"

"; $rec{$i}{description} =~ s/\#(.+?)\#/ \$1<\/A> /g; $rec{$i}{description} =~ s/(emb\||gb\:|sp\||gb\|)([a-zA-Z0-9\.]+)(\,|\)|\W)/$1 \$2<\/A>$3/ig; foreach $key (keys (%atxg)) { if ($rec{$i}{name} =~ /^$key/i) { $rec{$i}{description} .= "\, in $tclink{$atxg{$key}}"; } } $rec{$i}{description} =~ s/at(\w+)\.(\d)//i; print HTFILE2 "

",$rec{$i}{description}, "

\n
";
     printp($rec{$i}{sequence},*HTFILE2, $width, $color, 0, split ("\,",$rec{$i}{posit})); 
     print HTFILE2 "\n
\n"; print HTFILE2 "\n \n"; close(HTFILE2); if ($id) { if (! -d 'fasta') { mkdir('fasta', 0755); } #if fasta folder doesn't exist, make it chdir ('fasta'); open (HTFILE4, ">$fn"."\.fs") or die 'Could not open fasta file'; chdir('::'); print HTFILE4 ">",$rec{$i}{name}, "\n"; # prints fasta file print1($rec{$i}{sequence},*HTFILE4); } $id = 0; } } print "Finishing... \n"; open (OUT1, ">$name6") or die 'Could not open new seq file'; for $i (0..($num_seqs-1)) { # $rec{$i}{description} =~ s/<.+?>//g; #strip out the tags if ($rec{$i}{family} == 0) {$sf = $rec{$i}{sf} || $unknownsf} else {$sf = $sf_by_num[$rec{$i}{family}] || $unknownsf} print OUT1 "$i $rec{$i}{name} $sf $rec{$i}{family} $rec{$i}{description}\n" } close (OUT1); open (OUT2, ">$name7") or die 'Could not open new family file'; foreach $i (@family_list) { print OUT2 "$i $family_names[$i]\n"; } close (OUT2); sub init_record { $fn = shift; $ij = 0; while (($tp, $name, $direct, $tms, $gif, $desc) = split ' ', <$fn>, 6) { chomp($desc); $_ = $desc; /(at(\w+))\.(\d)\s/i; $rec{$ij}{name} =$1; #$rec{$ij}{name}=~ s/(\w+)\.(.+)/$1/; $desc =~ s/\$(.+)\$//; $rec {$ij} {posit} = $1; $rec{$ij}{direct} = $direct; $rec{$ij}{tms} = $tms; $rec{$ij}{gif} = $gif; $rec{$ij}{description} = $desc; $rec{$ij}{family} = 0; $rec{$ij}{sf} = 0; $rec{$ij}{sequence} = <$fn>; $ij++; } } sub fix_fam { my ($i, $new_fam, $preset) = @_; for $ki (0..$i) { if ($rec{$ki}{family} == $preset) { $rec{$ki}{family} = $new_fam; } } } sub numerically {$a <=> $b;} sub in { my $j = shift; my @list = @_; foreach $iin (@list) { if ($iin == $j) {return 1;} } return 0; } sub familysize { ($t, $ref1, $ref2) = @_; @a = @$ref1; @b = @$ref2; if (! scalar @b) {return @a} else { $t = shift @b; push (@a, $t); push (@b, @{$line[$t]}); @{$line[$t]} = (); %seen = (); foreach $e (@b) {$seen{$e} = 1} foreach $e (@a) {delete($seen{$e}) }; @b = keys %seen; return familysize ($t, \@a, \@b); } } sub print1 { #prints a string to $width my $line = shift; my $fh = shift; print $fh substr($line, 0 , $width), "\n"; #print the first line if (length ($line) > $width) { print1 (substr ($line, $width, ), $fh); #print1 the rest } } sub printp { #prints a string to $width and adds html tags for $color at positions in @list my ($line, $fh, $width, $color, $start, @list ) = @_; my $str = substr($line, 0 , $width); my @listcop = @list; my @nextlist = (); my @parts = (); $indx = 0; foreach $pos (@list) { if ($pos <= $width) { shift (@listcop); if (!$start) { $start = 1; push (@parts, (substr($str, $indx, $pos - 1 - $indx,) . "")); $indx = $pos-1; } else { push (@parts, (substr($str, $indx, $pos - $indx ,) . "<\/font>")); $start = 0; $indx= $pos; } } } if ($indx) { push (@parts, (substr($str,$indx))); $str = join('',@parts); } foreach $pos (@listcop) { push (@nextlist , ($pos - $width)); } print $fh $str,"\n"; #print the line if (length ($line) > $width) { printp (substr ($line, $width, ), $fh, $width, $color, $start, @nextlist); #printp the rest } }