ensembl-hive  2.8.1
EnsemblTranscriptGeneric.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 =head1 SYNOPSIS
34 
35 =head1 DESCRIPTION
36 
37 =head1 METHODS
38 
39 =cut
40 
41 
42 package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric;
43 
44 use strict;
45 use warnings;
46 no warnings 'uninitialized';
47 
50 
51 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
52 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
53 
54 
#
# Basic mapping: first pass over the transcript scoring matrix.
#
# Positional arguments:
#   $num               - counter interpolated into the cache names
#   $tsb               - object providing create_shrinked_matrix()
#   $mappings          - accepted for a uniform signature; it is overwritten
#                        by the result of basic_mapping() before being read
#   $transcript_scores - scoring matrix handed to basic_mapping()
#
# Returns the two-element list ($new_scores, $mappings).
#
sub init_basic {
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  # The mapping cache uses the incoming counter; the matrix cache uses the
  # counter after it has been incremented.
  my $mapping_cache = "transcript_mappings$num";
  $num++;
  my $matrix_cache = "transcript_matrix$num";

  $self->logger->info("Basic transcript mapping...\n", 0, 'stamped');

  $mappings = $self->basic_mapping($transcript_scores, $mapping_cache);

  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $mappings, $matrix_cache);

  return ($new_scores, $mappings);
}
75 
76 
#
# Handle cases with an exact transcript match but a different translation.
#
# Positional arguments:
#   $num               - counter interpolated into the cache names
#   $tsb               - object providing different_translation_rescore()
#                        and create_shrinked_matrix()
#   $mappings          - overwritten by the result of basic_mapping()
#   $transcript_scores - scoring matrix to rescore and map
#
# Returns the two-element list ($new_scores, $mappings).
#
sub non_exact_translation {
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  my $mapping_cache = "transcript_mappings$num";
  $num++;
  my $matrix_cache = "transcript_matrix$num";

  $self->logger->info("Exact Transcript non-exact Translation...\n", 0, 'stamped');

  # Rescore and persist only when the matrix does not report itself as loaded.
  if (!$transcript_scores->loaded) {
    $tsb->different_translation_rescore($transcript_scores);
    $transcript_scores->write_to_file;
  }

  $mappings = $self->basic_mapping($transcript_scores, $mapping_cache);

  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $mappings, $matrix_cache);

  return ($new_scores, $mappings);
}
102 
103 
#
# Reduce the score for mappings of transcripts which do not belong to
# mapped genes.
#
# Positional arguments:
#   $num               - counter interpolated into the cache names
#   $tsb               - object providing non_mapped_gene_rescore() and
#                        create_shrinked_matrix()
#   $mappings          - overwritten by the result of basic_mapping()
#   $transcript_scores - scoring matrix to rescore and map
#   $gene_mappings     - passed through to non_mapped_gene_rescore()
#
# Returns the two-element list ($new_scores, $mappings).
#
sub mapped_gene {
  my ($self, $num, $tsb, $mappings, $transcript_scores, $gene_mappings) = @_;

  my $mapping_cache = "transcript_mappings$num";
  $num++;
  my $matrix_cache = "transcript_matrix$num";

  $self->logger->info("Transcripts in mapped genes...\n", 0, 'stamped');

  # Rescore and persist only when the matrix does not report itself as loaded.
  if (!$transcript_scores->loaded) {
    $tsb->non_mapped_gene_rescore($transcript_scores, $gene_mappings);
    $transcript_scores->write_to_file;
  }

  $mappings = $self->basic_mapping($transcript_scores, $mapping_cache);

  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $mappings, $matrix_cache);

  return ($new_scores, $mappings);
}
131 
#
# Rescore by penalising scores between transcripts with different biotypes.
#
# Positional arguments:
#   $num               - counter interpolated into the cache names
#   $tsb               - object providing biotype_transcript_rescore() and
#                        create_shrinked_matrix()
#   $mappings          - accepted but not read by this method
#   $transcript_scores - scoring matrix to rescore and map
#
# Returns the two-element list ($new_scores, $new_mappings).
#
sub biotype {
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  my $mapping_cache = "transcript_mappings$num";
  $num++;
  my $matrix_cache = "transcript_matrix$num";

  $self->logger->info("Retry with biotype disambiguation...\n", 0, 'stamped');

  # Rescore and persist only when the matrix does not report itself as loaded.
  if (!$transcript_scores->loaded) {
    $tsb->biotype_transcript_rescore($transcript_scores);
    $transcript_scores->write_to_file;
  }

  my $new_mappings = $self->basic_mapping($transcript_scores, $mapping_cache);

  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $new_mappings,
                                 $matrix_cache);

  return ($new_scores, $new_mappings);
}
159 
#
# Selectively rescore by penalising scores between transcripts with
# different internalIDs.
#
# Positional arguments:
#   $num               - counter interpolated into the cache names
#   $tsb               - object providing internal_id_rescore() and
#                        create_shrinked_matrix()
#   $mappings          - overwritten by the result of basic_mapping()
#   $transcript_scores - scoring matrix to rescore and map
#
# Returns the two-element list ($new_scores, $mappings).
#
sub internal_id {
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  my $mapping_cache = "transcript_mappings$num";
  $num++;
  my $matrix_cache = "transcript_matrix$num";

  $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped');

  # Rescore and persist only when the matrix does not report itself as loaded.
  if (!$transcript_scores->loaded) {
    $tsb->internal_id_rescore($transcript_scores);
    $transcript_scores->write_to_file;
  }

  $mappings = $self->basic_mapping($transcript_scores, $mapping_cache);

  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $mappings, $matrix_cache);

  return ($new_scores, $mappings);
}
186 
187 
#
# Handle ambiguities between transcripts in single genes.
#
# Unlike the other mapping steps in this package, no rescoring is done here
# and the mapping is produced by same_gene_transcript_mapping() rather than
# basic_mapping().
#
# Positional arguments:
#   $num               - counter interpolated into the cache names
#   $tsb               - object providing create_shrinked_matrix()
#   $mappings          - overwritten by the result of
#                        same_gene_transcript_mapping()
#   $transcript_scores - scoring matrix to map
#
# Returns the two-element list ($new_scores, $mappings).
#
sub single_gene {
  my ($self, $num, $tsb, $mappings, $transcript_scores) = @_;

  my $mapping_cache = "transcript_mappings$num";
  $num++;
  my $matrix_cache = "transcript_matrix$num";

  $self->logger->info("Transcripts in single genes...\n", 0, 'stamped');

  # Persist the matrix when it does not report itself as loaded.
  $transcript_scores->write_to_file if !$transcript_scores->loaded;

  $mappings =
    $self->same_gene_transcript_mapping($transcript_scores, $mapping_cache);

  my $new_scores =
    $tsb->create_shrinked_matrix($transcript_scores, $mappings, $matrix_cache);

  return ($new_scores, $mappings);
}
212 
213 
#
# Modified basic mapper that maps transcripts that are ambiguous within one
# gene.
#
# Positional arguments:
#   $matrix       - a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#   $mapping_name - base name of the serialised mapping ("<name>.ser")
#
# Returns the MappingList: either the one restored from an existing cache
# file, or a newly built one (which is written to file before returning).
#
# Throws if $matrix is missing or of the wrong class, or if $mapping_name is
# false.
#
sub same_gene_transcript_mapping {
  my $self = shift;
  my $matrix = shift;
  my $mapping_name = shift;

  # argument checks
  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  throw('Need a name for serialising the mapping.') unless ($mapping_name);

  # Create a new MappingList object. Specify AUTO_LOAD to load serialised
  # existing mappings if found
  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');

  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH  => $dump_path,
    -CACHE_FILE => "${mapping_name}.ser",
    -AUTO_LOAD  => 1,
  );

  # checkpoint test: return a previously stored MappingList
  if ($mappings->loaded) {
    $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n");
    return $mappings;
  }

  my $sources_done = {};
  my $targets_done = {};

  # sort scoring matrix entries by descending score, breaking ties by
  # ascending source and then ascending target
  my @sorted_entries = sort { $b->score <=> $a->score ||
                              $a->source <=> $b->source ||
                              $a->target <=> $b->target }
                       @{ $matrix->get_all_Entries };

  foreach my $entry (@sorted_entries) {

    # skip if a mapping was already decided for either source or target
    next if ($sources_done->{$entry->source} or
             $targets_done->{$entry->target});

    # NOTE(review): these array refs are presumably filled in by
    # ambiguous_mapping() -- confirm against its implementation.
    my $other_sources = [];
    my $other_targets = [];

    if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) {

      $other_sources = $self->filter_sources($other_sources, $sources_done);
      $other_targets = $self->filter_targets($other_targets, $targets_done);

      # collect the distinct genes behind this entry and its competitors
      my %source_genes = ();
      my %target_genes = ();

      $source_genes{$self->cache->get_by_key('genes_by_transcript_id',
        'source', $entry->source)} = 1;
      $target_genes{$self->cache->get_by_key('genes_by_transcript_id',
        'target', $entry->target)} = 1;

      foreach my $other_source (@{ $other_sources }) {
        $source_genes{$self->cache->get_by_key('genes_by_transcript_id',
          'source', $other_source)} = 1;
      }

      foreach my $other_target (@{ $other_targets }) {
        $target_genes{$self->cache->get_by_key('genes_by_transcript_id',
          'target', $other_target)} = 1;
      }

      # only add mapping if only one source and target gene involved
      if (scalar(keys %source_genes) == 1 and scalar(keys %target_genes) == 1) {
        $mappings->add_Entry($entry);
      }

    } else {

      # this is the best mapping, add it
      $mappings->add_Entry($entry);
    }

    # source and target are marked as done even when an ambiguous entry was
    # not added above
    $sources_done->{$entry->source} = 1;
    $targets_done->{$entry->target} = 1;
  }

  # create checkpoint
  $mappings->write_to_file;

  return $mappings;
}
312 
313 
314 1;
315 
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::MappingList::new
public Bio::EnsEMBL::IdMapping::MappingList new()
Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper
Definition: BaseMapper.pm:17
Bio::EnsEMBL::IdMapping::MappingList::get_all_Entries
public Arrayref get_all_Entries()
Bio::EnsEMBL::IdMapping::MappingList
Definition: MappingList.pm:38
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68