2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 # Script that selects the best candidate for canonical transcripts on
20 # For usage instructions, run ./select_canonical_transcripts.pl -help
34 my ($host, $port, $dbname, $user, $pass);
35 my ($dnahost, $dnaport, $dnadbname, $dnauser, $dnapass);
36 my ($ccds_host, $ccds_dbname, $ccds_user, $ccds_port, $ccds_pass);
37 my ($log_path, $help);
41 my $coord_system_name;
44 # keep as undefined unless you only want to run on a specific analysis
48 my $include_non_ref = 1;
49 my $include_duplicates;
53 GetOptions(
'dbhost:s' => \$host,
55 'dbname:s' => \$dbname,
58 'dnadbhost:s' => \$dnahost,
59 'dnadbname:s' => \$dnadbname,
60 'dnadbuser:s' => \$dnauser,
61 'dnadbpass:s' => \$dnapass,
62 'dnadbport:s' => \$dnaport,
63 'ccdshost:s' => \$ccds_host,
64 'ccdsdbname:s' => \$ccds_dbname,
65 'ccdsuser:s' => \$ccds_user,
66 'ccdsport:s' => \$ccds_port,
67 'ccdspass:s' => \$ccds_pass,
69 'coord_system_name:s' => \$coord_system_name,
70 'seq_region_name:s' => \$seq_region_name,
72 'logic_name:s' => \$logic_name,
74 'include_non_ref!' => \$include_non_ref,
75 'include_duplicates' => \$include_duplicates,
76 'verbose!' => \$verbose,
78 # log file used
for analysing choices in bulk
79 'log:s' => \$log_path,
82 ) or die
"check options\n";
84 if ($help) { &
usage; exit; }
93 -species =>
'default',
98 # Get attribute id for canonical
99 my $attrib_type_id_sth = $dba->dbc->prepare(
"SELECT attrib_type_id FROM attrib_type WHERE code=?");
100 $attrib_type_id_sth->execute(
'is_canonical');
101 ($attrib_type_id) = $attrib_type_id_sth->fetchrow_array();
102 if (!$attrib_type_id) {
103 throw (
"No attrib_type_id found for 'is_canonical' attribute in attrib_type table");
106 print
"You have not used the -write option "
107 .
"so results will not be written into the database\n";
111 if(!$dnauser || !$dnahost) {
112 throw (
"You must provide user, host and dbname details ".
113 "to connect to DNA DB!");
120 -dbname => $dnadbname,
122 -species =>
'dna_'.$dba->species(),
124 $dba->dnadb($dna_db);
129 throw (
"Your gene DB contains no DNA. ".
130 "You must provide a DNA_DB to connect to");
137 if (!$ccds_user || !$ccds_host) {
138 throw (
"You must provide user, host and dbname details ".
139 "to connect to CCDS DB!");
147 -dbname => $ccds_dbname,
148 -species =>
'ccds_'.$dba->species(),
155 $log_fh = IO::File->new($log_path,
"w")
156 or
throw (
"Could not open logging file.");
159 my $transcript_selector =
162 my $slice_adaptor = $dba->get_SliceAdaptor;
165 if ($seq_region_name) {
177 if (!$coord_system_name) {
178 throw 'Requires a coordinate system name to function in this mode';
180 $slices = $slice_adaptor->
181 fetch_all($coord_system_name,
188 my $canonical_changes = 0;
193 foreach my $slice (@$slices) {
195 get_all_Genes($logic_name, undef, 1);
197 while (my $gene = shift @$genes) {
199 my $new_canonical = $transcript_selector->
200 select_canonical_transcript_for_Gene($gene);
202 my $old_canonical = $gene->canonical_transcript;
204 if ( !defined $old_canonical ) {
205 # Original canonical transcript is now absent, or never set.
207 print $log_fh
"//\n";
208 print $log_fh
"Old=[undef,undef,undef,undef,undef,undef,undef]\n";
209 printf $log_fh
"New=[%s,%s,%s,%s,%s,%s,'%s']\n",
210 @{ $transcript_selector->encode_transcript($new_canonical) };
212 push @change_list, [ $gene->dbID, $new_canonical->dbID ];
213 $canonical_changes++;
216 elsif ($new_canonical->dbID != $old_canonical->dbID) {
217 no warnings
'uninitialized';
219 "%s (%s) changed transcript from %s (%s) to %s (%s)\n",
222 $old_canonical->stable_id,
223 $old_canonical->dbID,
224 $new_canonical->stable_id,
225 $new_canonical->dbID;
227 push @change_list, [ $gene->dbID, $new_canonical->dbID, $old_canonical->dbID ];
228 $canonical_changes++;
231 printf
"Old transcript: [%s,%s,%s,%s,%s,%s,%s]\n",
232 @{ $transcript_selector->encode_transcript($old_canonical) };
233 printf
"New transcript: [%s,%s,%s,%s,%s,%s,%s]\n",
234 @{ $transcript_selector->encode_transcript($new_canonical) };
238 print $log_fh
"//\n";
239 printf $log_fh
"Old=[%s,%s,%s,%s,%s,%s,'%s']\n",
240 @{ $transcript_selector->encode_transcript($old_canonical) };
241 printf $log_fh
"New=[%s,%s,%s,%s,%s,%s,'%s']\n",
242 @{ $transcript_selector->encode_transcript($new_canonical) };
249 "Canonical transcript alterations: $canonical_changes ".
250 "from $total_genes genes\n";
251 if ($log_fh) {$log_fh->close}
255 ## Change database entries.
257 my $gene_update_sql =
"UPDATE gene SET canonical_transcript_id = ? where gene_id = ?";
258 my $gene_sth = $dba->dbc->prepare($gene_update_sql);
260 # Prepare transcript canonical attribute sqls
261 my $trans_select_sth = $dba->dbc->prepare(
"SELECT value FROM transcript_attrib WHERE transcript_id=? AND attrib_type_id=?");
262 my $trans_update_sth = $dba->dbc->prepare(
"UPDATE transcript_attrib SET value=? WHERE transcript_id=? AND attrib_type_id=?");
263 my $trans_insert_sth = $dba->dbc->prepare(
"INSERT INTO transcript_attrib (transcript_id, attrib_type_id, value) values(?,?,?)");
264 my $trans_delete_sth = $dba->dbc->prepare(
"DELETE FROM transcript_attrib WHERE transcript_id=? AND attrib_type_id=?");
266 print
"Updating database with new canonical transcripts...\n";
267 foreach my $change (@change_list) {
268 print
"Changin' ". $change->[1].
" on ". $change->[0].
"\n" if $verbose;
269 $gene_sth->execute( $change->[1], $change->[0]);
271 # Check if new canonical transcript attribute exists
272 $trans_select_sth->execute($change->[1], $attrib_type_id);
273 if (my ($new_canonical_exists) = $trans_select_sth->fetchrow_array()) {
274 # Update new canonical transcript attribute
275 $trans_update_sth->execute(1, $change->[1], $attrib_type_id);
277 # Insert new canonical transcript attribute
278 $trans_insert_sth->execute($change->[1], $attrib_type_id, 1);
281 # Check if old canonical transcript attribute exists
282 if (defined($change->[2])) {
283 $trans_select_sth->execute($change->[2], $attrib_type_id);
284 if (my ($old_canonical_exists) = $trans_select_sth->fetchrow_array()) {
285 # Delete old canonical transcript attribute
286 $trans_delete_sth->execute($change->[1], $attrib_type_id);
301 my $sql_command =
"SELECT COUNT(*) FROM dna";
303 my $sth = $dba->dbc->prepare($sql_command);
306 my @dna_array = $sth->fetchrow_array;
308 if ( $dna_array[0] > 0 ) {
310 "Your DB ". $dba->dbc->dbname.
" contains DNA sequences. ".
311 "No need to attach a DNA_DB to it.\n"
317 "Your DB " . $dba->dbc->dbname .
" does not contain DNA sequences.\n"
329 Example
usage: perl set_canonical_transcripts.pl -dbhost host -dbuser user
330 -dbpass *** -dbname dbname -dbport 3306 -coord_system toplevel -write
334 -dbname Database name
336 -dbhost Database host
338 -dbport Database port
340 -dbuser Database user
342 -dbpass Database password
344 Optional DB connection arguments:
346 -dnadbname DNA Database name
348 -dnadbhost DNA Database host
350 -dnadbuser DNA Database user
352 -dnadbport DNA Database port
354 -dnadbpass DNA Database pass
356 -ccdsdbname CCDS database name
358 -ccdshost CCDS database host
360 -ccdsuser CCDS database user
362 -ccdspass CCDS database pass
364 -ccdsport CCDS database port
366 Other optional arguments:
368 -coord_system_name Coordinate system to use
370 -include_non_ref Specify
if the non_reference regions should
371 be _excluded_. (
default: include)
373 -include_duplicates Specify
if the duplicate regions should be
374 _included_. eg. Human PAR on Y (
default: exclude)
376 -seq_region_name Chromosome name
if running a single seq_region
378 -write Specify
if results should be written to the database
380 -verbose Increase verbosity of output messages
382 -log Dump decision matrices into a log file
for analysis
384 To check the script has
run correctly you can
run the
385 CoreForeignKeys healthcheck:
387 ./
run-healthcheck.sh -d dbname -output problem CoreForeignKeys
389 A warning
about not
using CCDS is perfectly acceptible when not
390 running on Human, Mouse and Zebrafish.