2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 # this script tries to build repeat consensus sequences
18 # for TRF repeats that are unique and minimal.This should reduce the amount
19 # of TRF consensus objects.
21 # further it will replace all repeat consensi longer than 8 characters with
22 # just one for the length.
30 my ( $host, $user, $pass, $port, $dbname );
32 GetOptions(
"host=s", \$host,
39 if( !$host || !$dbname ) {
43 my $dsn =
"DBI:mysql:host=$host;dbname=$dbname";
45 $dsn .=
";port=$port";
48 # retrive all consensi of length 1..8
49 # build the normal consensus ( rotate, revcomp and build alphabetical minimum )
51 my $db = DBI->connect( $dsn, $user, $pass );
53 # check if the trfs have the uncompressed format
55 my $res = $db->selectall_arrayref(
"select count(*) from repeat_consensus where repeat_consensus rlike \"\\\\(\"" );
57 if( $res->[0]->[0] > 0 ) {
58 print STDERR
"Database might alread be converted, contains \"(\" in repeat_consensus.\n";
63 my $filename =
"tmp_old_new_rcid.txt";
64 open ( FH,
">$filename" ) or die ( "Couldnt write $filename " );
69 my $all_cons_count = 0;
70 my $sth = $db->prepare(
"select repeat_consensus_id, repeat_consensus " .
71 "from repeat_consensus where repeat_class = \"trf\" ".
72 "AND length(repeat_consensus) = $i" );
76 my ( %rc_2_norm, %rc_2_seq, %norm_2_rc );
78 while( my $arr = $sth->fetchrow_arrayref() ) {
79 my ( $rc_id, $rc_seq ) = @$arr;
82 $rc_2_norm{ $rc_id } = $norm_seq;
83 $rc_2_seq{ $rc_id } = $rc_seq;
84 if( $norm_seq eq $rc_seq ) {
85 $norm_2_rc{ $norm_seq } = $rc_id;
89 # iterate through the rc_ids and write old tab new lines out
90 for my $rc_id ( keys %rc_2_norm ) {
91 my $norm = $rc_2_norm{ $rc_id };
92 if( exists $norm_2_rc{ $norm } ) {
93 my $new_rc = $norm_2_rc{ $norm };
94 if( $rc_id != $new_rc ) {
95 print FH
"$rc_id\t$new_rc\n";
98 $norm_2_rc{ $norm } = $rc_id;
103 my $sth = $db->prepare(
"SELECT repeat_consensus_id, length( repeat_consensus ) " .
104 "FROM repeat_consensus WHERE repeat_class = \"trf\" " .
105 "AND length(repeat_consensus) > 8" );
110 my $length_removal_count = 0;
112 while( my $arr = $sth->fetchrow_arrayref() ) {
113 my ( $rc_id, $length ) = @$arr;
114 if( exists $len_2_rc{$length} ) {
115 # map this rc to the first one of that length
116 print FH
"$rc_id\t".$len_2_rc{ $length }.
"\n";
119 $len_2_rc{ $length } = $rc_id;
123 print STDERR
"File written ";
126 $db->do(
"create table tmp_old_new_rcid ( old_id int not null, new_id int not null, key old_idx( old_id))" );
128 $db->do(
"load data local infile '$filename' into table tmp_old_new_rcid" );
130 print STDERR
"and uploaded.\n";
132 $db->do(
"update repeat_feature rf, tmp_old_new_rcid tonr " .
133 "set rf.repeat_consensus_id = tonr.new_id " .
134 "where rf.repeat_consensus_id = tonr.old_id " );
136 print STDERR
"Repeat_features updated.\n";
138 $db->do(
"delete from rc " .
139 "using repeat_consensus rc, tmp_old_new_rcid tonr " .
140 "where rc.repeat_consensus_id = tonr.old_id " );
142 $db->do(
"update repeat_consensus " .
143 "set repeat_consensus = concat( length( repeat_consensus ), \"(N)\") " .
144 "where repeat_class = \"trf\" " .
145 "and length( repeat_consensus ) > 8" );
147 $db->do(
"drop table tmp_old_new_rcid" );
155 # find a representation of the input sequence that is canonical
156 # rotate and revcomp all possibilities and take alphabetical lowest
162 push( @all_seq, $seq );
164 for( my $i=1; $i<2*length( $seq ); $i++ ) {
165 if( $i != length( $seq )) {
166 $test_seq =
rotate( $test_seq );
170 my $new_seq = $test_seq;
171 push( @all_seq, $new_seq );
174 # print "old: ",join( " ", @all_seq),"\n";
175 @all_seq = sort {$a cmp $b} @all_seq;
176 # print "new: ",join( " ", @all_seq),"\n";
184 return substr( $string, 1 ) . substr( $string, 0, 1 );
189 $string = reverse( $string );
191 tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
199 Usage: perl shrink_trfs.pl -user .. -port .. -pass .. -host .. -dbname ..
200 Tries to minimize the number of repeat_consensi by only having one
201 trf consensus
for each length of repeat_consensus longer than 8.
202 Shorter repeat consensi are reduced to the alphabetical minimum
203 of all
rotate / revcomp equivalent repeats.