ensembl-hive  2.8.1
exon_conservation_check.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 use strict;
19 use warnings;
20 
21 use FindBin qw($Bin);
22 use lib "$Bin";
23 
24 #Bring in pipeline as well for their helpers
25 use lib "$Bin/../../../ensembl-pipeline/scripts/Finished/assembly";
26 use lib "$Bin/../../../ensembl-analysis/modules";
27 
28 #runtime include normally
29 require AssemblyMapper::Support;
31 use Bio::EnsEMBL::Utils::Exception qw( throw );
32 use Pod::Usage;
33 
34 #Genebuilder utils
35 require Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils;
36 Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils->import(qw/clone_Transcript/);
37 require Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils;
38 Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils->import(qw/clone_Exon/);
39 
40 my $support = AssemblyMapper::Support->new(
41  extra_options => [
42  qw/
43  check_transcripts!
44  check_exons!
45  /
46  ]
47 );
48 unless ($support->parse_arguments(@_)) {
49  warn $support->error if $support->error;
50  pod2usage(1);
51 }
52 $support->connect_dbs;
53 
54 my ($prod_host, $prod_user, $prod_pass, $prod_port);
55 if ($support->param('prod_host')) { $prod_host = $support->param('prod_host');
56 } else { $prod_host = $support->param('host'); }
57 
58 if ($support->param('prod_user')) { $prod_user = $support->param('prod_user');
59 } else { $prod_user = $support->param('user'); }
60 
61 if ($support->param('prod_pass')) { $prod_pass = $support->param('prod_pass');
62 } elsif ($support->param('prod_user')) {
63 # if prod_user is provided but not pass, there is no pass
64 } else {
65 # else, use same pass as for user/pass
66  $prod_pass = $support->param('pass'); }
67 
68 if ($support->param('prod_port')) { $prod_port = $support->param('prod_port');
69 } else { $prod_port = $support->param('port'); }
70 
71 
72 ###################################################################
73 #
74 # Get Biotype Manager
75 # Need to first get a production DBAdaptor
76 # Assumes the ensembl_production DB in on staging1
77 #
78 use Bio::EnsEMBL::Production::DBSQL::DBAdaptor;
79 my $prod_dba = Bio::EnsEMBL::Production::DBSQL::DBAdaptor->new(
80  -host => 'ens-staging1',
81  -user => 'ensro',
82  -port => 3306,
83  -dbname => 'ensembl_production',
84 );
85 $prod_dba or die "Cannot get a production DB adaptor";
86 my $biotype_manager = $prod_dba->get_biotype_manager();
87 
88 #
89 ###################################################################
90 
91 $support->log_stamped("Beginning analysis.\n");
92 $support->log("EXON KEY : !! = Very bad (pc mismatch), %% = Somewhat bad (mismatch), ?? = No mapping, might be bad, && = eval error\n");
93 $support->log("TRANSCRIPT KEY : @@ = Very bad (pc translation mismatch), ;; = Very bad (pc transcript mismatch), ** = Somewhat bad (mismatch), XX = No mapping, might be bad, ^^ = eval error\n");
94 
95 my $total_exons = 0;
96 my $total_transcripts = 0;
97 
98 $support->iterate_chromosomes(
99  prev_stage => '40-fix_overlaps',
100  this_stage => '41-conservation',
101  worker => \&compare,
102 );
103 
104 $support->log(sprintf("Total exons : %d\n", $total_exons));
105 $support->log(sprintf("Total transcripts : %d\n", $total_transcripts));
106 
107 $support->log_stamped("Finished.\n");
108 
109 sub compare {
110  my ($asp) = @_;
111  if ($support->param('check_exons')) {
112  compare_exons($asp);
113  }
114  if ($support->param('check_transcripts')) {
115  compare_transcripts($asp);
116  }
117  return 1;
118 }
119 
120 sub compare_exons {
121  my ($asp) = @_;
122 
123  my $R_chr = $asp->ref_chr;
124  my $A_chr = $asp->alt_chr;
125 
126  my $R_slice = $asp->ref_slice;
127  my $A_slice = $asp->alt_slice;
128 
129  my $new_slice_adaptor = $R_slice->adaptor();
130 
131  my $old_exons = $A_slice->get_all_Exons;
132 
133  $support->log(sprintf("Total exons %d\n", scalar(@{$old_exons})));
134 
135  while (my $old_exon = shift @$old_exons) {
136  $total_exons++;
137  eval {exon($old_exon, $R_slice, $new_slice_adaptor)};
138  if($@) {
139  $support->log(sprintf("&& | ID : %s | EVAL ERROR (%s)\n", $old_exon->stable_id(), $@));
140  }
141  }
142  return;
143 }
144 
145 sub exon {
146  my ($old_exon, $R_slice, $new_slice_adaptor) = @_;
147 
148  # Establish equivalent locations on old and new DBs
149  my $new_A_slice = new_Slice($old_exon, $new_slice_adaptor);
150 
151  # make a shadow exon for the new database
152  my $shadow_exon = clone_Exon($old_exon);
153  $shadow_exon->slice($new_A_slice);
154 
155  # project new exon to new assembly
156  my $projected_exon = $shadow_exon->transform($R_slice->coord_system->name, $R_slice->coord_system->version, $R_slice);
157 
158  # Note that Anacode database naming patterns interfere with normal Registry adaptor fetching,
159  # hence we must go around the houses somewhat when looking for the appropriate source gene.
160  #warn "!! fetching gene adaptor ".$old_exon->adaptor->species.":".$old_exon->adaptor->dbc->dbname."Gene";
161  my $old_gene_adaptor = $old_exon->adaptor->db->get_GeneAdaptor();
162  my $parent_gene = $old_gene_adaptor->fetch_by_exon_stable_id($old_exon->stable_id);
163 
164  my $state;
165  my $location;
166  my $difference;
167  my $total_length;
168 
169  # compare sequences if a projection exists
170  if ($projected_exon) {
171  my $old_seq = $old_exon->seq()->seq();
172  my $projected_seq = $projected_exon->seq()->seq();
173  $total_length = length($old_seq);
174 
175  if ($projected_seq ne $old_seq) {
176 
177  # Now we have a problem - the feature's sequence was not conserved between assemblies.
178  # Determine severity of the problem
179  $difference = diff(\$old_seq, \$projected_seq);
180 
181  my $biotype_group = $biotype_manager->fetch_biotype($parent_gene)->biotype_group;
182  if ($biotype_group eq 'coding') {
183  $state = '!!';
184  } else {
185  $state = '%%';
186  }
187  }
188 
189  $location = sprintf('%d : %d', $projected_exon->start(), $projected_exon->end());
190  }
191 
192  if (!$projected_exon) {
193  $state = '??';
194  $location = 'None';
195  $total_length = 0;
196  $difference = 0;
197  }
198 
199  if ($state) {
200  $support->log(sprintf('%s | ID : %s | Gene ID: %s | Location : %s | Differences : %d | Length : %d', $state, $old_exon->stable_id(), $parent_gene->stable_id(), $location, $difference, $total_length) . "\n");
201  }
202 
203  return;
204 }
205 
207  my ($asp) = @_;
208  my $R_slice = $asp->ref_slice;
209  my $A_slice = $asp->alt_slice;
210  my $new_slice_adaptor = $R_slice->adaptor();
211 
212  my $old_transcripts = $A_slice->get_all_Transcripts();
213  $support->log(sprintf("Total transcripts %d\n", scalar(@{$old_transcripts})));
214  while (my $old_transcript = shift @$old_transcripts) {
215  $total_transcripts++;
216  eval {transcript($old_transcript, $R_slice, $new_slice_adaptor)};
217  if($@) {
218  $support->log(sprintf("^^ | ID : %s | EVAL ERROR (%s)\n", $old_transcript->stable_id(), $@));
219  }
220  }
221 
222  return;
223 }
224 
225 sub transcript {
226  my ($old_transcript, $R_slice, $new_slice_adaptor) = @_;
227 
228  my $new_A_slice = new_Slice($old_transcript, $new_slice_adaptor);
229  my $shadow_transcript = clone_Transcript($old_transcript);
230  foreach my $e (@{$shadow_transcript->get_all_Exons()}) {
231  $e->slice($new_A_slice);
232  }
233  foreach my $se (@{$shadow_transcript->get_all_supporting_features()}) {
234  $se->slice($new_A_slice);
235  }
236  $shadow_transcript->slice($new_A_slice);
237  my $projected_transcript = $shadow_transcript->transform($R_slice->coord_system->name, $R_slice->coord_system->version, $R_slice);
238 
239  my $state;
240  my $location;
241  my $total_length;
242  my $difference;
243  if ($projected_transcript) {
244 
245  #Check if it was protein coding
246  my $biotype_group = $biotype_manager->fetch_biotype($projected_transcript)->biotype_group;
247 
248  my $is_pc = 0;
249  #Now check for protein sequence mis-match
250  if ($biotype_group eq 'coding') {
251  $is_pc = 1;
252  my $old_seq = $old_transcript->translate()->seq();
253  my $new_seq = $projected_transcript->translate()->seq();
254  $total_length = length($old_seq);
255  if ($old_seq ne $new_seq) {
256  $state = '@@';
257  $difference = diff(\$old_seq, \$new_seq);
258  }
259  }
260 
261  if (!$state) {
262  my $old_seq = $old_transcript->spliced_seq();
263  my $new_seq = $projected_transcript->spliced_seq();
264  $total_length = length($old_seq);
265  if ($old_seq ne $new_seq) {
266  $state = ($is_pc) ? ';;' : '**';
267  $difference = diff(\$old_seq, \$new_seq);
268  }
269  }
270 
271  $location = sprintf('%d : %d', $projected_transcript->start(), $projected_transcript->end());
272  }
273  else {
274  $state = 'XX';
275  $location = 'None';
276  $total_length = 0;
277  $difference = 0;
278  }
279 
280  if ($state) {
281  $support->log(sprintf('%s | ID : %s | Location : %s | Differences : %d | Length : %d', $state, $old_transcript->stable_id(), $location, $difference, $total_length) . "\n");
282  }
283 
284  return;
285 }
286 
287 sub new_Slice {
288  my ($feature, $new_slice_adaptor) = @_;
289  my $old_slice = $feature->slice();
290  my $coord_system = $old_slice->coord_system;
291  my $new_slice = $new_slice_adaptor->fetch_by_region($coord_system->name, $old_slice->seq_region_name, $old_slice->start, $old_slice->end, $old_slice->strand, $coord_system->version);
292  return $new_slice;
293 }
294 
295 #Count number of diffs between strings
296 sub diff {
297  my ($s1, $s2) = @_;
298  my $diff = ${$s1} ^ ${$s2};
299  my $total = 0;
300  $total += ($+[1] - $-[1]) while $diff =~ m{ ([^\x00]+) }xmsg;
301  return $total;
302 }
303 
304 __END__
305 
306 =pod
307 
308 =head1 NAME
309 
310 exon_conservation_check.pl
311 
312 =head1 SUMMARY
313 
314 This script is intended to highlight issues with an assembly mapping, by inspecting
315 the equivalent sequence for each exon. The resulting log is grep-suitable and keyed
316 for severity.
317 
318 =head1 SYNOPSIS
319 
320 perl exon_conservation_check.pl <many arguments>
321 
322  --dbname, db_name=NAME database name NAME
323  --host, --dbhost, --db_host=HOST database host HOST
324  --port, --dbport, --db_port=PORT database port PORT
325  --user, --dbuser, --db_user=USER database username USER
326  --pass, --dbpass, --db_pass=PASS database passwort PASS
327  --assembly=ASSEMBLY assembly version ASSEMBLY
328  --altdbname=NAME alternative database NAME
329  --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
330  --prod_host database host for production database
331  --prod_user database user for production database
332  --prod_db database name for production database
333  --reg_conf registry configuration file
334 
335 Optional options
336  --logfile, --log=FILE log to FILE (default: *STDOUT)
337  --logpath=PATH write logfile to PATH (default: .)
338 
339  --check_exons Test exons for their robustness
340  between assembly mappings
341 
342  --check_transcripts Expand the test to check if we still
343  get full length transcript encoding
344 =cut
transcript
public transcript()
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
exon
public exon()
diff
public diff()
compare_exons
public compare_exons()
new_Slice
public new_Slice()
compare_transcripts
public compare_transcripts()
compare
public compare()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68