2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
24 #Bring in pipeline as well for their helpers
25 use lib
"$Bin/../../../ensembl-pipeline/scripts/Finished/assembly";
26 use lib
"$Bin/../../../ensembl-analysis/modules";
28 #runtime include normally
29 require AssemblyMapper::Support;
35 require Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils;
36 Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::TranscriptUtils->import(qw/clone_Transcript/);
37 require Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils;
38 Bio::EnsEMBL::Analysis::Tools::GeneBuildUtils::ExonUtils->import(qw/clone_Exon/);
40 my $support = AssemblyMapper::Support->new(
48 unless ($support->parse_arguments(@_)) {
49 warn $support->error
if $support->error;
52 $support->connect_dbs;
54 my ($prod_host, $prod_user, $prod_pass, $prod_port);
55 if ($support->param(
'prod_host')) { $prod_host = $support->param(
'prod_host');
56 }
else { $prod_host = $support->param(
'host'); }
58 if ($support->param(
'prod_user')) { $prod_user = $support->param(
'prod_user');
59 }
else { $prod_user = $support->param(
'user'); }
61 if ($support->param(
'prod_pass')) { $prod_pass = $support->param(
'prod_pass');
62 } elsif ($support->param(
'prod_user')) {
63 # if prod_user is provided but not pass, there is no pass
65 # else, use same pass as for user/pass
66 $prod_pass = $support->param(
'pass'); }
68 if ($support->param(
'prod_port')) { $prod_port = $support->param(
'prod_port');
69 }
else { $prod_port = $support->param(
'port'); }
72 ###################################################################
75 # Need to first get a production DBAdaptor
76 # Assumes the ensembl_production DB in on staging1
78 use Bio::EnsEMBL::Production::DBSQL::DBAdaptor;
79 my $prod_dba = Bio::EnsEMBL::Production::DBSQL::DBAdaptor->new(
80 -host =>
'ens-staging1',
83 -dbname =>
'ensembl_production',
85 $prod_dba or die
"Cannot get a production DB adaptor";
86 my $biotype_manager = $prod_dba->get_biotype_manager();
89 ###################################################################
91 $support->log_stamped(
"Beginning analysis.\n");
92 $support->log(
"EXON KEY : !! = Very bad (pc mismatch), %% = Somewhat bad (mismatch), ?? = No mapping, might be bad, && = eval error\n");
93 $support->log(
"TRANSCRIPT KEY : @@ = Very bad (pc translation mismatch), ;; = Very bad (pc transcript mismatch), ** = Somewhat bad (mismatch), XX = No mapping, might be bad, ^^ = eval error\n");
96 my $total_transcripts = 0;
98 $support->iterate_chromosomes(
99 prev_stage =>
'40-fix_overlaps',
100 this_stage =>
'41-conservation',
104 $support->log(sprintf(
"Total exons : %d\n", $total_exons));
105 $support->log(sprintf(
"Total transcripts : %d\n", $total_transcripts));
107 $support->log_stamped(
"Finished.\n");
111 if ($support->param(
'check_exons')) {
114 if ($support->param(
'check_transcripts')) {
123 my $R_chr = $asp->ref_chr;
124 my $A_chr = $asp->alt_chr;
126 my $R_slice = $asp->ref_slice;
127 my $A_slice = $asp->alt_slice;
129 my $new_slice_adaptor = $R_slice->adaptor();
131 my $old_exons = $A_slice->get_all_Exons;
133 $support->log(sprintf(
"Total exons %d\n", scalar(@{$old_exons})));
135 while (my $old_exon = shift @$old_exons) {
137 eval {
exon($old_exon, $R_slice, $new_slice_adaptor)};
139 $support->log(sprintf(
"&& | ID : %s | EVAL ERROR (%s)\n", $old_exon->stable_id(), $@));
146 my ($old_exon, $R_slice, $new_slice_adaptor) = @_;
148 # Establish equivalent locations on old and new DBs
149 my $new_A_slice =
new_Slice($old_exon, $new_slice_adaptor);
151 # make a shadow exon for the new database
152 my $shadow_exon = clone_Exon($old_exon);
153 $shadow_exon->slice($new_A_slice);
155 # project new exon to new assembly
156 my $projected_exon = $shadow_exon->transform($R_slice->coord_system->name, $R_slice->coord_system->version, $R_slice);
158 # Note that Anacode database naming patterns interfere with normal Registry adaptor fetching,
159 # hence we must go around the houses somewhat when looking for the appropriate source gene.
160 #warn "!! fetching gene adaptor ".$old_exon->adaptor->species.":".$old_exon->adaptor->dbc->dbname."Gene";
161 my $old_gene_adaptor = $old_exon->adaptor->db->get_GeneAdaptor();
162 my $parent_gene = $old_gene_adaptor->fetch_by_exon_stable_id($old_exon->stable_id);
169 # compare sequences if a projection exists
170 if ($projected_exon) {
171 my $old_seq = $old_exon->seq()->seq();
172 my $projected_seq = $projected_exon->seq()->seq();
173 $total_length = length($old_seq);
175 if ($projected_seq ne $old_seq) {
177 # Now we have a problem - the feature's sequence was not conserved between assemblies.
178 # Determine severity of the problem
179 $difference =
diff(\$old_seq, \$projected_seq);
181 my $biotype_group = $biotype_manager->fetch_biotype($parent_gene)->biotype_group;
182 if ($biotype_group eq
'coding') {
189 $location = sprintf(
'%d : %d', $projected_exon->start(), $projected_exon->end());
192 if (!$projected_exon) {
200 $support->log(sprintf(
'%s | ID : %s | Gene ID: %s | Location : %s | Differences : %d | Length : %d', $state, $old_exon->stable_id(), $parent_gene->stable_id(), $location, $difference, $total_length) .
"\n");
208 my $R_slice = $asp->ref_slice;
209 my $A_slice = $asp->alt_slice;
210 my $new_slice_adaptor = $R_slice->adaptor();
212 my $old_transcripts = $A_slice->get_all_Transcripts();
213 $support->log(sprintf(
"Total transcripts %d\n", scalar(@{$old_transcripts})));
214 while (my $old_transcript = shift @$old_transcripts) {
215 $total_transcripts++;
216 eval {
transcript($old_transcript, $R_slice, $new_slice_adaptor)};
218 $support->log(sprintf(
"^^ | ID : %s | EVAL ERROR (%s)\n", $old_transcript->stable_id(), $@));
226 my ($old_transcript, $R_slice, $new_slice_adaptor) = @_;
228 my $new_A_slice =
new_Slice($old_transcript, $new_slice_adaptor);
229 my $shadow_transcript = clone_Transcript($old_transcript);
230 foreach my $e (@{$shadow_transcript->get_all_Exons()}) {
231 $e->slice($new_A_slice);
233 foreach my $se (@{$shadow_transcript->get_all_supporting_features()}) {
234 $se->slice($new_A_slice);
236 $shadow_transcript->slice($new_A_slice);
237 my $projected_transcript = $shadow_transcript->transform($R_slice->coord_system->name, $R_slice->coord_system->version, $R_slice);
243 if ($projected_transcript) {
245 #Check if it was protein coding
246 my $biotype_group = $biotype_manager->fetch_biotype($projected_transcript)->biotype_group;
249 #Now check for protein sequence mis-match
250 if ($biotype_group eq
'coding') {
252 my $old_seq = $old_transcript->translate()->seq();
253 my $new_seq = $projected_transcript->translate()->seq();
254 $total_length = length($old_seq);
255 if ($old_seq ne $new_seq) {
257 $difference =
diff(\$old_seq, \$new_seq);
262 my $old_seq = $old_transcript->spliced_seq();
263 my $new_seq = $projected_transcript->spliced_seq();
264 $total_length = length($old_seq);
265 if ($old_seq ne $new_seq) {
266 $state = ($is_pc) ?
';;' :
'**';
267 $difference =
diff(\$old_seq, \$new_seq);
271 $location = sprintf(
'%d : %d', $projected_transcript->start(), $projected_transcript->end());
281 $support->log(sprintf(
'%s | ID : %s | Location : %s | Differences : %d | Length : %d', $state, $old_transcript->stable_id(), $location, $difference, $total_length) .
"\n");
288 my ($feature, $new_slice_adaptor) = @_;
289 my $old_slice = $feature->slice();
290 my $coord_system = $old_slice->coord_system;
291 my $new_slice = $new_slice_adaptor->fetch_by_region($coord_system->name, $old_slice->seq_region_name, $old_slice->start, $old_slice->end, $old_slice->strand, $coord_system->version);
295 #Count number of diffs between strings
298 my $diff = ${$s1} ^ ${$s2};
300 $total += ($+[1] - $-[1])
while $diff =~ m{ ([^\x00]+) }xmsg;
310 exon_conservation_check.pl
314 This script is intended to highlight issues with an assembly mapping, by inspecting
315 the equivalent sequence
for each exon. The resulting log is grep-suitable and keyed
320 perl exon_conservation_check.pl <many arguments>
322 --dbname, db_name=NAME database name NAME
323 --host, --dbhost, --db_host=HOST database host HOST
324 --port, --dbport, --db_port=PORT database port PORT
325 --user, --dbuser, --db_user=USER database username USER
326 --pass, --dbpass, --db_pass=PASS database passwort PASS
327 --assembly=ASSEMBLY assembly version ASSEMBLY
328 --altdbname=NAME alternative database NAME
329 --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
330 --prod_host database host
for production database
331 --prod_user database user
for production database
332 --prod_db database name
for production database
333 --reg_conf registry configuration file
336 --logfile, --log=FILE log to FILE (
default: *STDOUT)
337 --logpath=PATH write logfile to PATH (
default: .)
339 --check_exons Test exons
for their robustness
340 between assembly mappings
342 --check_transcripts Expand the test to check
if we still