ensembl-hive  2.7.0
shrink_trfs.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 # this script tries to build repeat consensus sequences
18 # for TRF repeats that are unique and minimal.This should reduce the amount
19 # of TRF consensus objects.
20 
21 # further it will replace all repeat consensi longer than 8 characters with
22 # just one for the length.
23 
24 use strict;
25 use warnings;
26 
27 use DBI;
28 use Getopt::Long;
29 
30 my ( $host, $user, $pass, $port, $dbname );
31 
32 GetOptions( "host=s", \$host,
33  "user=s", \$user,
34  "pass=s", \$pass,
35  "port=i", \$port,
36  "dbname=s", \$dbname
37  );
38 
39 if( !$host || !$dbname ) {
40  usage();
41 }
42 
43 my $dsn = "DBI:mysql:host=$host;dbname=$dbname";
44 if( $port ) {
45  $dsn .= ";port=$port";
46 }
47 
48 # retrive all consensi of length 1..8
49 # build the normal consensus ( rotate, revcomp and build alphabetical minimum )
50 
51 my $db = DBI->connect( $dsn, $user, $pass );
52 
53 # check if the trfs have the uncompressed format
54 
55 my $res = $db->selectall_arrayref( "select count(*) from repeat_consensus where repeat_consensus rlike \"\\\\(\"" );
56 
57 if( $res->[0]->[0] > 0 ) {
58  print STDERR "Database might alread be converted, contains \"(\" in repeat_consensus.\n";
59  exit;
60 }
61 
62 
63 my $filename = "tmp_old_new_rcid.txt";
64 open ( FH, ">$filename" ) or die ( "Couldnt write $filename " );
65 
66 for my $i ( 1..8 ) {
67 
68  my $remap_count = 0;
69  my $all_cons_count = 0;
70  my $sth = $db->prepare( "select repeat_consensus_id, repeat_consensus " .
71  "from repeat_consensus where repeat_class = \"trf\" ".
72  "AND length(repeat_consensus) = $i" );
73 
74  $sth->execute();
75 
76  my ( %rc_2_norm, %rc_2_seq, %norm_2_rc );
77 
78  while( my $arr = $sth->fetchrow_arrayref() ) {
79  my ( $rc_id, $rc_seq ) = @$arr;
80  $all_cons_count++;
81  my $norm_seq = norm_seq( $rc_seq );
82  $rc_2_norm{ $rc_id } = $norm_seq;
83  $rc_2_seq{ $rc_id } = $rc_seq;
84  if( $norm_seq eq $rc_seq ) {
85  $norm_2_rc{ $norm_seq } = $rc_id;
86  }
87  }
88 
89  # iterate through the rc_ids and write old tab new lines out
90  for my $rc_id ( keys %rc_2_norm ) {
91  my $norm = $rc_2_norm{ $rc_id };
92  if( exists $norm_2_rc{ $norm } ) {
93  my $new_rc = $norm_2_rc{ $norm };
94  if( $rc_id != $new_rc ) {
95  print FH "$rc_id\t$new_rc\n";
96  }
97  } else {
98  $norm_2_rc{ $norm } = $rc_id;
99  }
100  }
101 }
102 
103 my $sth = $db->prepare( "SELECT repeat_consensus_id, length( repeat_consensus ) " .
104  "FROM repeat_consensus WHERE repeat_class = \"trf\" " .
105  "AND length(repeat_consensus) > 8" );
106 
107 $sth->execute();
108 
109 my %len_2_rc;
110 my $length_removal_count = 0;
111 
112 while( my $arr = $sth->fetchrow_arrayref() ) {
113  my ( $rc_id, $length ) = @$arr;
114  if( exists $len_2_rc{$length} ) {
115  # map this rc to the first one of that length
116  print FH "$rc_id\t".$len_2_rc{ $length }."\n";
117 
118  } else {
119  $len_2_rc{ $length } = $rc_id;
120  }
121 }
122 
123 print STDERR "File written ";
124 
125 close( FH );
126 $db->do( "create table tmp_old_new_rcid ( old_id int not null, new_id int not null, key old_idx( old_id))" );
127 
128 $db->do( "load data local infile '$filename' into table tmp_old_new_rcid" );
129 
130 print STDERR "and uploaded.\n";
131 
132 $db->do( "update repeat_feature rf, tmp_old_new_rcid tonr " .
133  "set rf.repeat_consensus_id = tonr.new_id " .
134  "where rf.repeat_consensus_id = tonr.old_id " );
135 
136 print STDERR "Repeat_features updated.\n";
137 
138 $db->do( "delete from rc " .
139  "using repeat_consensus rc, tmp_old_new_rcid tonr " .
140  "where rc.repeat_consensus_id = tonr.old_id " );
141 
142 $db->do( "update repeat_consensus " .
143  "set repeat_consensus = concat( length( repeat_consensus ), \"(N)\") " .
144  "where repeat_class = \"trf\" " .
145  "and length( repeat_consensus ) > 8" );
146 
147 $db->do( "drop table tmp_old_new_rcid" );
148 
149 unlink( $filename );
150 
151 exit;
152 
153 
154 
155 # find a representation of the input sequence that is canonical
156 # rotate and revcomp all possibilities and take alphabetical lowest
157 sub norm_seq {
158  my $seq = shift;
159  my $test_seq = $seq;
160 
161  my @all_seq = ();
162  push( @all_seq, $seq );
163 
164  for( my $i=1; $i<2*length( $seq ); $i++ ) {
165  if( $i != length( $seq )) {
166  $test_seq = rotate( $test_seq );
167  } else {
168  $test_seq = rev_comp( $test_seq );
169  }
170  my $new_seq = $test_seq;
171  push( @all_seq, $new_seq );
172  }
173 
174 # print "old: ",join( " ", @all_seq),"\n";
175  @all_seq = sort {$a cmp $b} @all_seq;
176 # print "new: ",join( " ", @all_seq),"\n";
177  return $all_seq[0];
178 }
179 
180 
181 
182 sub rotate {
183  my $string = shift;
184  return substr( $string, 1 ) . substr( $string, 0, 1 );
185 }
186 
187 sub rev_comp {
188  my $string = shift;
189  $string = reverse( $string );
190  $string =~
191  tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
192  return $string;
193 }
194 
195 sub usage {
196 
197  print STDERR <<EOC
198 
199  Usage: perl shrink_trfs.pl -user .. -port .. -pass .. -host .. -dbname ..
200  Tries to minimize the number of repeat_consensi by only having one
201  trf consensus for each length of repeat_consensus longer than 8.
202  Shorter repeat consensi are reduced to the alphabetical minimum
203  of all rotate / revcomp equivalent repeats.
204 
205 EOC
206 ;
207  exit;
208 }
usage
public usage()
norm_seq
public norm_seq()
rotate
public rotate()
rev_comp
public rev_comp()