3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2024] EMBL-European Bioinformatics Institute
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
37 # get a result analyser
45 $analyser->analyse( $gene_mappings,
46 $stable_id_mapper->get_all_stable_id_events(
'similarity') );
48 # write results to file
49 $analyser->write_results_to_file;
52 $analyser->create_clicklist;
55 $analyser->create_mapping_summary;
59 This is a utility module which analyses the stable Id mapping results
60 by providing various sorts of mapping statistics. It also creates
61 clicklists and a mapping summary.
67 classify_source_genes_by_type
68 classify_genes_by_mapping_simple
69 classify_genes_by_mapping
80 create_mapping_summary
86 package Bio::EnsEMBL::IdMapping::ResultAnalyser;
90 no warnings
'uninitialized';
103 Arg[2] : Arrayref of Strings - similarity events
104 Example : $analyser->analyse($gene_mappings,
105 $stable_id_mapper->get_all_stable_id_events(
'similarity'));
106 Description : Analyses the results of a stable Id mapping
run.
108 Exceptions : thrown on wrong or missing arguments
117 my $gene_mappings = shift;
118 my $similarity_events = shift;
121 unless ($gene_mappings and
122 $gene_mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList')) {
123 throw(
"Need a Bio::EnsEMBL::IdMapping::MappingList of genes.");
126 unless ($similarity_events and ref($similarity_events) eq
'ARRAY') {
127 throw(
"Need a list of similarity events.");
130 # classify source genes by type (logic_name-biotype)
131 $self->classify_source_genes_by_type;
133 # classify source genes by mapping status
134 $self->classify_genes_by_mapping($gene_mappings, $similarity_events);
138 =head2 classify_source_genes_by_type
140 Example : $analyser->classify_source_genes_by_type;
141 Description : Classifies source genes by type and adds them to the
internal
142 datastructure. For the format of the classification
string see
152 sub classify_source_genes_by_type {
155 foreach my $s_gene (values %{ $self->cache->get_by_name(
'genes_by_id',
'source') }) {
156 $self->add(
'source', $self->class_key($s_gene),
'all', $s_gene->stable_id);
161 =head2 classify_genes_by_mapping_simple
163 Arg[1] : Bio::EnsEMBL::IdMapping::MapppingList $gene_mappings - gene
165 Example : $analyser->classify_genes_by_mapping_simple;
166 Description : Classifies target genes by mapping (
'mapped' or
'unmapped').
168 Exceptions : thrown on wrong or missing argument
169 Caller : This method is not in use at the momen.
175 sub classify_genes_by_mapping_simple {
177 my $gene_mappings = shift;
180 unless ($gene_mappings and
181 $gene_mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList')) {
182 throw(
"Need a Bio::EnsEMBL::IdMapping::MappingList of genes.");
187 # firrst, create a lookup hash of source genes by target internal ID
188 my %source_genes_by_target = ();
189 foreach my $e (@{ $gene_mappings->get_all_Entries }) {
190 my $s_gene = $self->cache->get_by_key(
'genes_by_id',
'source', $e->source);
191 my $t_gene = $self->cache->get_by_key(
'genes_by_id',
'target', $e->target);
192 $source_genes_by_target{$t_gene->id} = $s_gene;
195 # now loop over target genes
196 foreach my $t_gene (values %{ $self->cache->get_by_name(
'genes_by_id',
'target') }) {
198 # check if target gene has all required properties set
199 unless ($t_gene->logic_name and $t_gene->biotype) {
200 $self->logger->warning(
"Missing data for target gene: ".
201 $t_gene->to_string.
"\n", 1);
204 my $class = $self->class_key($t_gene);
206 # classify as '1' if mapped (using source gene's stable ID), otherwise '0'
207 if (my $s_gene = $source_genes_by_target{$t_gene->id}) {
208 $self->add(
'target', $class,
'mapped', $s_gene->stable_id);
210 $self->add(
'target', $class,
'unmapped', $t_gene->stable_id);
217 =head2 classify_genes_by_mapping
219 Arg[1] : Bio::EnsEMBL::IdMapping::MapppingList $gene_mappings - gene
221 Arg[2] : Arrayref of Strings - similarity events
222 Example : $analyser->classify_genes_by_mapping;
223 Description : Classifies genes by mapping. Status is
224 'mapped' => stable Id was mapped
225 'lost_similar' => stable Id not mapped, but there is a
226 similarity entry
for the source Id
227 'lost_definite' => not mapped and no similarity
229 Exceptions : thrown on wrong or missing argument
230 Caller : This method is not in use at the momen.
236 sub classify_genes_by_mapping {
238 my $gene_mappings = shift;
239 my $similarity_events = shift;
242 unless ($gene_mappings and
243 $gene_mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList')) {
244 throw(
"Need a Bio::EnsEMBL::IdMapping::MappingList of genes.");
247 unless ($similarity_events and ref($similarity_events) eq
'ARRAY') {
248 throw(
"Need a list of similarity events.");
252 foreach my $e (@{ $gene_mappings->get_all_Entries }) {
253 my $s_gene = $self->cache->get_by_key(
'genes_by_id',
'source', $e->source);
254 $self->add(
'source', $self->class_key($s_gene),
'mapped',
258 # lookup hash for similarities
260 foreach my $event (@{ $similarity_events }) {
261 my ($stable_id) = split(
"\t", $event);
262 $similar{$stable_id} = 1;
266 foreach my $s_gene (values %{ $self->cache->get_by_name(
'genes_by_id',
'source') }) {
268 my $stable_id = $s_gene->stable_id;
269 my $class = $self->class_key($s_gene);
271 unless ($self->get(
'source', $class,
'mapped', $stable_id)) {
273 # sub-classify as 'lost_similar' or 'lost_definite'
274 if ($similar{$stable_id}) {
275 $self->add(
'source', $class,
'lost_similar', $stable_id);
277 $self->add(
'source', $class,
'lost_definite', $stable_id);
288 Arg[1] : String $dbtype - db type (
'source' or
'target')
289 Arg[2] : String $class - key identifying a gene type (see class_key())
290 Arg[3] : String $subclass - status identifier (e.g.
'mapped',
'lost')
291 Arg[4] : String $stable_id - gene stable Id
292 Arg[5] : String $val - value (usually 0 or 1)
293 Example : $analyser->add(
'source',
'KNOWN-ensembl-protein_coding',
294 'mapped',
'ENSG00002342', 1);
295 Description : Add a stable Id /
property pair to a name/dbtype lookup hash.
297 The datastructure is a bit of a bloat, but is general enough to
298 be used as a lookup hash and to generate statistics (counts by
299 type) and
debug lists (dump by type).
300 Return type : String - the added value
309 my ($self, $dbtype, $class, $subclass, $stable_id, $val) = @_;
311 # private method, so no argument check done for performance reasons
313 # default to a value of '1'
314 $val = 1 unless (defined($val));
316 $self->{$dbtype}->{$class}->{$subclass}->{$stable_id} = $val;
322 Arg[1] : String $dbtype - db type (
'source' or
'target')
323 Arg[2] : String $class - key identifying a gene type (see class_key())
324 Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost')
325 Arg[4] : String $stable_id - gene stable Id
326 Example : my $mapping_status = $analyser->get('source',
327 'KNOWN-ensembl-protein_coding', 'mapped', 'ENSG00002342');
328 Description : Gets a stable Id mapping status from the internal datastructure.
338 my ($self, $dbtype, $class, $subclass, $stable_id) = @_;
340 # private method, so no argument check done for performance reasons
342 return $self->{$dbtype}->{$class}->{$subclass}->{$stable_id};
346 =head2 get_all_by_subclass
348 Arg[1] : String $dbtype - db type (
'source' or
'target')
349 Arg[2] : String $class - key identifying a gene type (see class_key())
350 Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost')
351 Example : my @mapped_stable_ids = @{
352 $analyser->get_all_by_subclass(
353 'source',
'KNOWN-ensembl-protein_coding',
356 Description : Gets a list of stable Id
for a given subclass.
357 Return type : Arrayref of String (stable Ids)
358 Exceptions : thrown on missing arguments
365 sub get_all_by_subclass {
366 my ($self, $dbtype, $class, $subclass) = @_;
369 throw(
"Need a dbtype (source|target).") unless ($dbtype);
370 throw(
"Need a class.") unless ($class);
371 throw(
"Need a subclass.") unless ($subclass);
373 return [ keys %{ $self->{$dbtype}->{$class}->{$subclass} || {} } ];
377 =head2 get_all_by_class
379 Arg[1] : String $dbtype - db type (
'source' or
'target')
380 Arg[2] : String $class - key identifying a gene type (see class_key())
381 Example : my @stable_ids = @{
382 $analyser->get_all_by_class(
'source',
383 'KNOWN-ensembl-protein_coding' ) };
384 Description : Gets a list of stable Id
for a given
class.
385 Return type : Arrayref of String (stable Ids)
386 Exceptions : thrown on missing arguments
393 sub get_all_by_class {
394 my ($self, $dbtype, $class) = @_;
397 throw(
"Need a dbtype (source|target).") unless ($dbtype);
398 throw(
"Need a class.") unless ($class);
402 foreach my $subclass (keys %{ $self->{$dbtype}->{$class} || {} }) {
403 while (my ($key, $val) = each(%{ $self->{$dbtype}->{$class}->{$subclass} })) {
404 $merged{$key} = $val;
408 return [ keys %merged ];
412 =head2 get_count_by_subclass
414 Arg[1] : String $dbtype - db type (
'source' or
'target')
415 Arg[2] : String $class - key identifying a gene type (see class_key())
416 Arg[3] : String $subclass - status identifier (e.g. 'mapped', 'lost')
417 Example : my $num_mapped = $analyser->get_count_by_subclass('source',
418 'KNOWN-ensembl-protein_coding', 'mapped');
419 Description : Gets the number of stable Ids for a given subclass.
421 Exceptions : thrown on missing arguments
428 sub get_count_by_subclass {
429 my ($self, $dbtype, $class, $subclass) = @_;
432 throw(
"Need a dbtype (source|target).") unless ($dbtype);
433 throw(
"Need a class.") unless ($class);
434 throw(
"Need a subclass.") unless ($subclass);
436 return scalar(keys %{ $self->{$dbtype}->{$class}->{$subclass} || {} });
440 =head2 get_count_by_class
442 Arg[1] : String $dbtype - db type (
'source' or
'target')
443 Arg[2] : String $class - key identifying a gene type (see class_key())
444 Example : my $num_mapped = $analyser->get_count_by_class('source',
445 'KNOWN-ensembl-protein_coding');
446 Description : Gets the number of stable Ids for a given class.
448 Exceptions : thrown on missing arguments
455 sub get_count_by_class {
456 my ($self, $dbtype, $class) = @_;
459 throw(
"Need a dbtype (source|target).") unless ($dbtype);
460 throw(
"Need a class.") unless ($class);
462 return scalar(@{ $self->get_all_by_class($dbtype, $class) });
466 =head2 get_all_classes
468 Arg[1] : String $dbtype - db type (
'source' or
'target')
469 Example :
foreach my $class (@{ $analyser->get_all_classes(
'source') }) {
472 Description : Gets a list of classes in the ResultAnalyser.
473 Return type : Arrayref of String
474 Exceptions : thrown on missing argument
481 sub get_all_classes {
482 my ($self, $dbtype) = @_;
485 throw(
"Need a dbtype (source|target).") unless ($dbtype);
487 return [ sort keys %{ $self->{$dbtype} || {} } ];
494 Example : my $class = $analyser->class_key($gene);
495 Description : Generates a key identifying a gene
class. This identifier is
496 composed from the gene
's logic naame, and biotye.
506 my ($self, $gene) = @_;
507 return join('-
', map { $gene->$_ } qw(logic_name biotype));
511 =head2 write_results_to_file
513 Example : $analyser->write_results_to_file;
514 Description : Writes the results of the result analysis to a file. This is a
515 human-readable text detailing the mapping statistics.
524 sub write_results_to_file {
527 my $fh = $self->get_filehandle('gene_detailed_mapping_stats.txt
', 'stats
');
529 my $fmt1 = "%-60s%-16s%-16s%-16s\n";
530 my $fmt2 = "%-60s%5.0f (%7s) %5.0f (%7s) %5.0f (%7s)\n";
531 my $fmt3 = "%3.2f%%";
533 print $fh "Gene detailed mapping results:\n\n";
535 print $fh sprintf($fmt1, "Gene type", "mapped", "lost (similar)",
538 print $fh ('-
'x108), "\n";
540 foreach my $class (@{ $self->get_all_classes('source
') }) {
541 next if ($class eq 'all
');
543 my $total = $self->get_count_by_class('source
', $class);
545 # avoid division by zero error
547 $self->logger->warning("No count found for $class.\n", 1);
551 my $mapped = $self->get_count_by_subclass('source
', $class, 'mapped
');
552 my $similar = $self->get_count_by_subclass('source
', $class,
554 my $lost = $self->get_count_by_subclass('source
', $class, 'lost_definite
');
556 print $fh sprintf($fmt2,
558 $mapped, sprintf($fmt3, $mapped/$total*100),
559 $similar, sprintf($fmt3, $similar/$total*100),
560 $lost, sprintf($fmt3, $lost/$total*100));
567 =head2 create_clicklist
569 Example : $analyser->create_clicklist;
570 Description : Writes an html file which contains a list of all lost genes,
571 with hyperlinks to the appropriate archive website. This is to
572 manually check lost genes.
581 sub create_clicklist {
584 my $fh = $self->get_filehandle('genes_lost.html
', 'stats
');
587 print $fh qq(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n);
588 print $fh qq(<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-gb" lang="en-gb">);
589 print $fh "<head>\n";
590 print $fh "<title>Lost genes ";
591 print $fh $self->conf->param('sourcedbname
'), ' ->
',
592 $self->conf->param('targetdbname
');
593 print $fh "</title>\n";
594 print $fh "</head>\n<body>\n";
596 my $prefix = $self->conf->param('urlprefix
');
598 $self->logger->warning("No urlprefix set, clicklists might not be useable.\n", 1);
604 foreach my $class (@{ $self->get_all_classes('source
') }) {
605 next if ($class eq 'all
');
607 $navigation .= "$class ";
608 $clicklist .= "<h1>$class</h1>\n";
610 foreach my $subclass (qw(lost_similar lost_definite)) {
613 $navigation .= qq(<a href="#${class}-$subclass">$subclass</a> );
616 $clicklist .= "<h2>$subclass</h2>\n";
618 foreach my $stable_id (@{ $self->get_all_by_subclass('source
', $class, $subclass) }) {
619 $clicklist .= qq(<a href="${prefix}$stable_id">$stable_id</a><br />\n);
624 $navigation .= "<br />\n";
627 # print navigation and clicklist
628 print $fh "$navigation\n\n";
629 print $fh "$clicklist\n\n";
632 print $fh "</body></html>\n";
638 =head2 create_mapping_summary
640 Example : $analyser->create_mapping_summary();
641 Description : Writes a text file containing a summary of the mapping stats.
642 This will be emailed to the genebuilder for evaluation (you will
643 have to manually send the email, using the text in
644 "mapping_summary.txt" as the template).
653 sub create_mapping_summary {
656 my $fh = $self->get_filehandle('mapping_summary.txt
');
661 print $fh qq(Stable ID mapping results\n);
662 print $fh qq(=========================\n\n);
667 print $fh "Run at: ".localtime()."\n";
668 print $fh "Runtime: ";
669 print $fh $self->logger->runtime, "\n\n";
672 # parameters used for this run
674 print $fh $self->conf->list_param_values;
680 foreach my $type (qw(exon transcript translation gene gene_detailed)) {
681 my $filename = "${type}_mapping_stats.txt";
683 if ($self->file_exists($filename, 'stats
')) {
684 print $fh $self->read_from_file($filename, 'stats
');
687 print $fh "No mapping stats found for $type.\n\n";
695 ['stable_ids
' => 'Stable IDs
'],
696 ['events
' => 'Stable ID events and mapping session
'],
700 my $fmt1 = "%-40s%-20s\n";
702 print $fh qq(Data uploaded to db:\n);
703 print $fh qq(====================\n\n);
705 if ($self->conf->param('dry_run
')) {
707 print $fh "None (dry run).\n";
711 foreach my $u (@uploads) {
713 $uploaded = 'yes
' if ($self->conf->is_true("upload_".$u->[0]));
714 print $fh sprintf($fmt1, $u->[1], $uploaded);
722 # stats and clicklist
725 ['stats
' => 'statistics (including clicklists of deleted IDs)
'],
726 ['debug' => 'detailed mapping output
for debugging
'],
727 ['tables
' => 'data files
for db upload
'],
730 my $fmt2 = "%-20s%-50s\n";
732 print $fh qq(\nOutput directories:\n);
733 print $fh qq(===================\n\n);
735 print $fh sprintf($fmt2, qw(DIRECTORY DESCRIPTION));
736 print $fh ('-
'x72), "\n";
738 print $fh sprintf($fmt2, 'basedir
', $self->conf->param('basedir
'));
740 foreach my $o (@output) {
741 print $fh sprintf($fmt2, '$basedir/
'.$o->[0], $o->[1]);
747 # clicklist of first 10 deleted genes
749 print $fh qq(\nFirst 10 deleted known genes:\n);
750 print $fh qq(=============================\n\n);
752 my $in_fh = $self->get_filehandle('genes_lost.txt
', 'debug', '<
');
753 my $prefix = $self->conf->param('urlprefix
');
760 my ($stable_id, $type) = split(/\s+/);
762 next unless ($type eq 'known
');
764 print $fh sprintf($fmt2, $stable_id, "${prefix}$stable_id");
772 =head2 read_from_file
774 Arg[1] : String $filename - name of file to read
775 Arg[2] : (optional) String $append - directory name to append to basedir
776 Example : my $stats_text = $analyser->read_from_file('gene_mapping_stats
',
778 Description : Reads mapping stats from a file.
789 my $filename = shift;
792 my $in_fh = $self->get_filehandle($filename, $append, '<
');