ensembl-hive  2.8.1
select_canonical_transcripts.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 # Script that selects the best candidate for canonical transcripts on
18 # each gene.
19 
20 # For usage instructions, run ./select_canonical_transcripts.pl -help
21 
22 
23 use strict;
24 use warnings;
25 
26 use Getopt::Long;
27 use IO::File;
28 
29 use Bio::EnsEMBL::Utils::Exception qw(throw);
33 
34 my ($host, $port, $dbname, $user, $pass);
35 my ($dnahost, $dnaport, $dnadbname, $dnauser, $dnapass);
36 my ($ccds_host, $ccds_dbname, $ccds_user, $ccds_port, $ccds_pass);
37 my ($log_path, $help);
38 
40 
41 my $coord_system_name;
42 my $seq_region_name;
43 
44 # keep as undefined unless you only want to run on a specific analysis
45 my $logic_name;
46 
47 my $write = 0;
48 my $include_non_ref = 1;
49 my $include_duplicates;
50 my $verbose = 0;
51 my $attrib_type_id;
52 
53 GetOptions( 'dbhost:s' => \$host,
54  'dbport:n' => \$port,
55  'dbname:s' => \$dbname,
56  'dbuser:s' => \$user,
57  'dbpass:s' => \$pass,
58  'dnadbhost:s' => \$dnahost,
59  'dnadbname:s' => \$dnadbname,
60  'dnadbuser:s' => \$dnauser,
61  'dnadbpass:s' => \$dnapass,
62  'dnadbport:s' => \$dnaport,
63  'ccdshost:s' => \$ccds_host,
64  'ccdsdbname:s' => \$ccds_dbname,
65  'ccdsuser:s' => \$ccds_user,
66  'ccdsport:s' => \$ccds_port,
67  'ccdspass:s' => \$ccds_pass,
68 
69  'coord_system_name:s' => \$coord_system_name,
70  'seq_region_name:s' => \$seq_region_name,
71 
72  'logic_name:s' => \$logic_name,
73  'write!' => \$write,
74  'include_non_ref!' => \$include_non_ref,
75  'include_duplicates' => \$include_duplicates,
76  'verbose!' => \$verbose,
77 
78  # log file used for analysing choices in bulk
79  'log:s' => \$log_path,
80  'help!' => \$help,
81  'h!' => \$help
82  ) or die "check options\n";
83 
84 if ($help) { &usage; exit; }
85 
86 my $dba =
88  -host => $host,
89  -user => $user,
90  -port => $port,
91  -dbname => $dbname,
92  -pass => $pass,
93  -species => 'default',
94  -no_cache => 1,
95  );
96 
97 if ($write) {
98  # Get attribute id for canonical
99  my $attrib_type_id_sth = $dba->dbc->prepare("SELECT attrib_type_id FROM attrib_type WHERE code=?");
100  $attrib_type_id_sth->execute('is_canonical');
101  ($attrib_type_id) = $attrib_type_id_sth->fetchrow_array();
102  if (!$attrib_type_id) {
103  throw ("No attrib_type_id found for 'is_canonical' attribute in attrib_type table");
104  }
105 } else {
106  print "You have not used the -write option "
107  . "so results will not be written into the database\n";
108 }
109 
110 if($dnadbname) {
111  if(!$dnauser || !$dnahost) {
112  throw ("You must provide user, host and dbname details ".
113  "to connect to DNA DB!");
114  }
115  my $dna_db =
117  -host => $dnahost,
118  -user => $dnauser,
119  -port => $dnaport,
120  -dbname => $dnadbname,
121  -pass => $dnapass,
122  -species => 'dna_'.$dba->species(),
123  );
124  $dba->dnadb($dna_db);
125 }
126 else {
127  my $dna = check_if_DB_contains_DNA($dba);
128  if(!$dna) {
129  throw ("Your gene DB contains no DNA. ".
130  "You must provide a DNA_DB to connect to");
131  }
132 }
133 
134 my $ccds_dba;
135 
136 if ($ccds_dbname) {
137  if (!$ccds_user || !$ccds_host) {
138  throw ("You must provide user, host and dbname details ".
139  "to connect to CCDS DB!");
140  }
141  $ccds_dba =
143  -host => $ccds_host,
144  -user => $ccds_user,
145  -port => $ccds_port,
146  -pass => $ccds_pass,
147  -dbname => $ccds_dbname,
148  -species => 'ccds_'.$dba->species(),
149  -no_cache => 1,
150  );
151 }
152 
153 my $log_fh;
154 if ($log_path) {
155  $log_fh = IO::File->new($log_path,"w")
156  or throw ("Could not open logging file.");
157 }
158 
159 my $transcript_selector =
161 
162 my $slice_adaptor = $dba->get_SliceAdaptor;
163 my $slices;
164 
165 if ($seq_region_name) {
166  $slices = [
167  $slice_adaptor->
168  fetch_by_region(
169  $coord_system_name,
170  $seq_region_name,
171  $include_non_ref,
172  $include_duplicates
173  )
174  ];
175 }
176 else {
177  if (!$coord_system_name) {
178  throw 'Requires a coordinate system name to function in this mode';
179  }
180  $slices = $slice_adaptor->
181  fetch_all($coord_system_name,
182  '',
183  $include_non_ref,
184  $include_duplicates
185  );
186 }
187 
188 my $canonical_changes = 0;
189 my $total_genes = 0;
190 
191 my @change_list;
192 
193 foreach my $slice (@$slices) {
194  my $genes = $slice->
195  get_all_Genes($logic_name, undef, 1);
196 
197  while (my $gene = shift @$genes) {
198  $total_genes++;
199  my $new_canonical = $transcript_selector->
200  select_canonical_transcript_for_Gene($gene);
201 
202  my $old_canonical = $gene->canonical_transcript;
203 
204  if ( !defined $old_canonical ) {
205  # Original canonical transcript is now absent, or never set.
206  if ($log_fh) {
207  print $log_fh "//\n";
208  print $log_fh "Old=[undef,undef,undef,undef,undef,undef,undef]\n";
209  printf $log_fh "New=[%s,%s,%s,%s,%s,%s,'%s']\n",
210  @{ $transcript_selector->encode_transcript($new_canonical) };
211  }
212  push @change_list, [ $gene->dbID, $new_canonical->dbID ];
213  $canonical_changes++;
214  }
215 
216  elsif ($new_canonical->dbID != $old_canonical->dbID) {
217  no warnings 'uninitialized';
218  printf
219  "%s (%s) changed transcript from %s (%s) to %s (%s)\n",
220  $gene->stable_id,
221  $gene->dbID,
222  $old_canonical->stable_id,
223  $old_canonical->dbID,
224  $new_canonical->stable_id,
225  $new_canonical->dbID;
226 
227  push @change_list, [ $gene->dbID, $new_canonical->dbID, $old_canonical->dbID ];
228  $canonical_changes++;
229 
230  if ($verbose) {
231  printf "Old transcript: [%s,%s,%s,%s,%s,%s,%s]\n",
232  @{ $transcript_selector->encode_transcript($old_canonical) };
233  printf "New transcript: [%s,%s,%s,%s,%s,%s,%s]\n",
234  @{ $transcript_selector->encode_transcript($new_canonical) };
235  }
236 
237  if ($log_fh) {
238  print $log_fh "//\n";
239  printf $log_fh "Old=[%s,%s,%s,%s,%s,%s,'%s']\n",
240  @{ $transcript_selector->encode_transcript($old_canonical) };
241  printf $log_fh "New=[%s,%s,%s,%s,%s,%s,'%s']\n",
242  @{ $transcript_selector->encode_transcript($new_canonical) };
243  }
244  }
245  }
246 }
247 
248 print
249  "Canonical transcript alterations: $canonical_changes ".
250  "from $total_genes genes\n";
251 if ($log_fh) {$log_fh->close}
252 
253 
254 
255 ## Change database entries.
256 if ($write) {
257  my $gene_update_sql = "UPDATE gene SET canonical_transcript_id = ? where gene_id = ?";
258  my $gene_sth = $dba->dbc->prepare($gene_update_sql);
259 
260  # Prepare transcript canonical attribute sqls
261  my $trans_select_sth = $dba->dbc->prepare("SELECT value FROM transcript_attrib WHERE transcript_id=? AND attrib_type_id=?");
262  my $trans_update_sth = $dba->dbc->prepare("UPDATE transcript_attrib SET value=? WHERE transcript_id=? AND attrib_type_id=?");
263  my $trans_insert_sth = $dba->dbc->prepare("INSERT INTO transcript_attrib (transcript_id, attrib_type_id, value) values(?,?,?)");
264  my $trans_delete_sth = $dba->dbc->prepare("DELETE FROM transcript_attrib WHERE transcript_id=? AND attrib_type_id=?");
265 
266  print "Updating database with new canonical transcripts...\n";
267  foreach my $change (@change_list) {
268  print "Changin' ". $change->[1]. " on ". $change->[0]."\n" if $verbose;
269  $gene_sth->execute( $change->[1], $change->[0]);
270 
271  # Check if new canonical transcript attribute exists
272  $trans_select_sth->execute($change->[1], $attrib_type_id);
273  if (my ($new_canonical_exists) = $trans_select_sth->fetchrow_array()) {
274  # Update new canonical transcript attribute
275  $trans_update_sth->execute(1, $change->[1], $attrib_type_id);
276  } else {
277  # Insert new canonical transcript attribute
278  $trans_insert_sth->execute($change->[1], $attrib_type_id, 1);
279  }
280 
281  # Check if old canonical transcript attribute exists
282  if (defined($change->[2])) {
283  $trans_select_sth->execute($change->[2], $attrib_type_id);
284  if (my ($old_canonical_exists) = $trans_select_sth->fetchrow_array()) {
285  # Delete old canonical transcript attribute
286  $trans_delete_sth->execute($change->[1], $attrib_type_id);
287  }
288  }
289  }
290  print "Done\n";
291 }
292 
293 print "Done\n";
294 
295 
296 
297 ## Subroutines
298 
300  my $dba = shift;
301  my $sql_command = "SELECT COUNT(*) FROM dna";
302 
303  my $sth = $dba->dbc->prepare($sql_command);
304  $sth->execute();
305 
306  my @dna_array = $sth->fetchrow_array;
307 
308  if ( $dna_array[0] > 0 ) {
309  print
310  "Your DB ". $dba->dbc->dbname. " contains DNA sequences. ".
311  "No need to attach a DNA_DB to it.\n"
312  if $verbose;
313  return 1;
314  }
315  else {
316  print
317  "Your DB " . $dba->dbc->dbname . " does not contain DNA sequences.\n"
318  if $verbose;
319  return 0;
320  }
321 }
322 
323 
324 ## POD anyone?
325 
326 sub usage {
327 print <<EOS
328 
329 Example usage: perl set_canonical_transcripts.pl -dbhost host -dbuser user
330  -dbpass *** -dbname dbname -dbport 3306 -coord_system toplevel -write
331 
332 Script options:
333 
334  -dbname Database name
335 
336  -dbhost Database host
337 
338  -dbport Database port
339 
340  -dbuser Database user
341 
342  -dbpass Database password
343 
344 Optional DB connection arguments:
345 
346  -dnadbname DNA Database name
347 
348  -dnadbhost DNA Database host
349 
350  -dnadbuser DNA Database user
351 
352  -dnadbport DNA Database port
353 
354  -dnadbpass DNA Database pass
355 
356  -ccdsdbname CCDS database name
357 
358  -ccdshost CCDS database host
359 
360  -ccdsuser CCDS database user
361 
362  -ccdspass CCDS database pass
363 
364  -ccdsport CCDS database port
365 
366 Other optional arguments:
367 
368  -coord_system_name Coordinate system to use
369 
370  -include_non_ref Specify if the non_reference regions should
371  be _excluded_. (default: include)
372 
373  -include_duplicates Specify if the duplicate regions should be
374  _included_. eg. Human PAR on Y (default: exclude)
375 
376  -seq_region_name Chromosome name if running a single seq_region
377 
378  -write Specify if results should be written to the database
379 
380  -verbose Increase verbosity of output messages
381 
382  -log Dump decision matrices into a log file for analysis
383 
384 To check the script has run correctly you can run the
385 CoreForeignKeys healthcheck:
386 
387 ./run-healthcheck.sh -d dbname -output problem CoreForeignKeys
388 
389 A warning about not using CCDS is perfectly acceptible when not
390 running on Human, Mouse and Zebrafish.
391 
392 EOS
393 ;
394 
395 }
check_if_DB_contains_DNA
public check_if_DB_contains_DNA()
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
Bio::EnsEMBL::Registry::no_cache_warnings
public Boolean no_cache_warnings()
Bio::EnsEMBL::Utils::TranscriptSelector::new
public Bio::EnsEMBL::Utils::TranscriptSelector new()
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
about
public about()
Script
Definition: dump_mysql.pl:9
run
public run()
Bio::EnsEMBL::Utils::TranscriptSelector
Definition: TranscriptSelector.pm:30
usage
public usage()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68