ensembl-hive  2.6
id_mapping.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3 # Copyright [2016-2024] EMBL-European Bioinformatics Institute
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 # Don't change the above line.
18 # Change the PATH in the myRun.ksh script if you want to use another perl.
19 
20 =head1 NAME
21 
22 
23 =head1 SYNOPSIS
24 
25 id_mapping.pl [arguments]
26 
27 Required arguments:
28 
29  --dbname, --db_name=NAME database name NAME
30  --host, --dbhost, --db_host=HOST database host HOST
31  --port, --dbport, --db_port=PORT database port PORT
32  --user, --dbuser, --db_user=USER database username USER
33  --pass, --dbpass, --db_pass=PASS database password PASS
34 
35 Optional arguments:
36 
37  --conffile, --conf=FILE read parameters from FILE
38  (default: conf/Conversion.ini)
39 
40  --logfile, --log=FILE log to FILE (default: *STDOUT)
41  --logpath=PATH write logfile to PATH (default: .)
42  --logappend, --log_append append to logfile (default: truncate)
43  --loglevel=LEVEL define log level (default: INFO)
44 
45  -i, --interactive=0|1 run script interactively (default: true)
46  -n, --dry_run, --dry=0|1 don't write results to database
47  -h, --help, -? print help (this message)
48 
49 =head1 DESCRIPTION
50 
51 
52 
53 =head1 AUTHOR
54 
55 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
56 
57 =head1 CONTACT
58 
59 Please post comments/questions to the Ensembl development list
60 <http://lists.ensembl.org/mailman/listinfo/dev>
61 
62 =cut
63 
64 use strict;
65 use warnings;
66 no warnings 'uninitialized';
67 
68 use FindBin qw($Bin);
69 use Bio::EnsEMBL::Utils::ConfParser;
70 use Bio::EnsEMBL::Utils::Logger;
71 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
72 use Bio::EnsEMBL::IdMapping::Cache;
73 use Bio::EnsEMBL::IdMapping::ExonScoreBuilder;
74 use Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder;
75 use Bio::EnsEMBL::IdMapping::GeneScoreBuilder;
76 use Bio::EnsEMBL::IdMapping::InternalIdMapper;
77 use Bio::EnsEMBL::IdMapping::StableIdMapper;
78 use Bio::EnsEMBL::IdMapping::Archiver;
79 use Bio::EnsEMBL::IdMapping::ResultAnalyser;
80 
81 #use Devel::Size qw(size total_size);
82 #use Data::Dumper;
83 #$Data::Dumper::Indent = 1;
84 
# Parse configuration and command-line arguments.
# Both -SERVERROOT and -DEFAULT_CONF are given relative to the directory
# containing this script ($Bin).  The constructor is called with the arrow
# form rather than indirect object syntax ("new Class(...)"), which is
# ambiguous to parse and disabled under modern feature bundles.
my $conf = Bio::EnsEMBL::Utils::ConfParser->new(
    -SERVERROOT   => "$Bin/../../..",
    -DEFAULT_CONF => "$Bin/default.conf",
);

# Script-specific options.  Each key is an option specification and each
# value is the 0/1 flag handed to parse_options() with it -- presumably
# 1 marks the option as required (TODO confirm against ConfParser).
$conf->parse_options(
    'mode=s'                                         => 0,
    'basedir|basedir=s'                              => 1,
    'chromosomes|chr=s@'                             => 0,
    'region=s'                                       => 0,
    'biotypes=s@'                                    => 0,
    'biotypes_include=s@'                            => 0,
    'biotypes_exclude=s@'                            => 0,
    'min_exon_length|minexonlength=i'                => 0,
    'exonerate_path|exoneratepath=s'                 => 1,
    'exonerate_threshold|exoneratethreshold=f'       => 0,
    'exonerate_jobs|exoneratejobs=i'                 => 0,
    'exonerate_bytes_per_job|exoneratebytesperjob=f' => 0,
    'exonerate_extra_params|exonerateextraparams=s'  => 0,
    'plugin_internal_id_mappers_gene=s@'             => 0,
    'plugin_internal_id_mappers_transcript=s@'       => 0,
    'plugin_internal_id_mappers_exon=s@'             => 0,
    'mapping_types=s@'                               => 1,
    'plugin_stable_id_generator=s'                   => 0,
    'upload_events|uploadevents=s'                   => 0,
    'upload_stable_ids|uploadstableids=s'            => 0,
    'upload_archive|uploadarchive=s'                 => 0,
);
114 
# Default the log directory to path_append(basedir, 'log') when no logpath
# was configured.
unless ($conf->param('logpath')) {
    $conf->param('logpath', path_append($conf->param('basedir'), 'log'));
}

# Create the logger from the configured log settings.  The constructor is
# called with the arrow form rather than indirect object syntax
# ("new Class(...)"), which is ambiguous to parse.
my $logger = Bio::EnsEMBL::Utils::Logger->new(
    -LOGFILE      => $conf->param('logfile'),
    -LOGAUTO      => $conf->param('logauto'),
    -LOGAUTOBASE  => 'id_mapping',
    -LOGAUTOID    => $conf->param('logautoid'),
    -LOGPATH      => $conf->param('logpath'),
    -LOGAPPEND    => $conf->param('logappend'),
    -LOGLEVEL     => $conf->param('loglevel'),
    -IS_COMPONENT => $conf->param('is_component'),
);

# Initialise the log, handing it the configured parameter values.
$logger->init_log($conf->list_param_values);
134 
135 
# Shared state: assigned by build_scores() and map(), then read by the
# later pipeline steps.
my ($esb, $tsb, $gsb);
my ($exon_scores, $transcript_scores, $gene_scores);
my ($exon_mappings, $transcript_mappings, $gene_mappings);
my $translation_mappings;

# Cache object handed to every component created in this script.  It is
# constructed with -LOAD_INSTANCE => 1; the original comment on this call
# reads "loading cache from file".
my $cache = Bio::EnsEMBL::IdMapping::Cache->new(
    -LOGGER        => $logger,
    -CONF          => $conf,
    -LOAD_INSTANCE => 1,
);

# Stable ID mapper, used by the assignment, event, dump and upload steps.
my $stable_id_mapper = Bio::EnsEMBL::IdMapping::StableIdMapper->new(
    -LOGGER => $logger,
    -CONF   => $conf,
    -CACHE  => $cache,
);
162 
163 
# Lookup table of the entity types requested via 'mapping_types': each
# configured type becomes a key with the value 1.
# (A hash slice is used instead of the builtin map() because this file
# defines its own sub named 'map'.)
my @requested_types = $conf->param('mapping_types');
my %mapping_types   = ();
@mapping_types{@requested_types} = (1) x @requested_types;
169 
170 
# Run in the requested mode.  'normal' is the default when no mode is
# configured, and 'mapping' is accepted as an alias for it.
my $mode = $conf->param('mode') || 'normal';
$mode = 'normal' if $mode eq 'mapping';

# Look the handler sub up by name (run_<mode>) with can() instead of calling
# through a symbolic reference.  This keeps 'strict refs' enabled for the
# rest of the file and lets an unknown mode fail with a clear message rather
# than Perl's generic "Undefined subroutine" error.
my $run = __PACKAGE__->can("run_${mode}")
    or die "Unknown mode \"${mode}\": no run_${mode}() handler is defined\n";
$run->();


# finish logfile
$logger->finish_log;
181 
182 
183 ### END main ###
184 
# Full run: build scores, map, assign stable IDs, write events and archives
# to file, upload the table data and finally analyse the results.
#
# Takes no arguments.  Every step is called with an explicit (empty)
# argument list; the former "&name;" form silently forwarded this sub's own
# @_ to each step.
sub run_normal {

    # build scores
    build_scores();

    # map stable IDs
    # ('map' is also a Perl builtin, so the '&' sigil is required to reach
    # the sub defined in this file)
    &map();

    # assign stable IDs and make creation and deletion events
    assign_stable_ids();

    # generate similarity events
    generate_similarity_events();

    # dump existing stable_id_event table to file
    dump_existing_events();

    # create gene and peptide archive
    archive($stable_id_mapper->mapping_session_id);

    # upload table data files into db
    upload_mapping_session_and_events();
    upload_stable_ids();
    upload_archive();

    # final stats and mapping summary
    analyse_results();
}
213 
214 
# Upload-only run: performs just the three upload steps.
#
# Takes no arguments.  The steps are called with an explicit (empty)
# argument list; the former "&name;" form silently forwarded this sub's own
# @_ to each of them.
sub run_upload {
    # upload table data files into db
    upload_mapping_session_and_events();
    upload_stable_ids();
    upload_archive();
}
221 
222 
# Create the exon, transcript and gene score builders and run them,
# storing builders and results in the file-level variables
# ($esb/$tsb/$gsb and $exon_scores/$transcript_scores/$gene_scores).
sub build_scores {

    # all three builders are constructed with the same arguments
    my @builder_args = (
        -LOGGER => $logger,
        -CONF   => $conf,
        -CACHE  => $cache,
    );

    $esb = Bio::EnsEMBL::IdMapping::ExonScoreBuilder->new(@builder_args);
    $tsb = Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder->new(@builder_args);
    $gsb = Bio::EnsEMBL::IdMapping::GeneScoreBuilder->new(@builder_args);

    # each level is scored from the level below it:
    # exons first, then transcripts from exon scores, then genes from
    # transcript scores
    $exon_scores       = $esb->score_exons;
    $transcript_scores = $tsb->score_transcripts($exon_scores);
    $gene_scores       = $gsb->score_genes($transcript_scores);
}
251 
252 
# Compute internal ID mappings and store them in the file-level
# $gene_mappings, $transcript_mappings, $exon_mappings and
# $translation_mappings.  Genes are always mapped; the other levels only
# when requested through %mapping_types.
sub map {

    my $mapper = Bio::EnsEMBL::IdMapping::InternalIdMapper->new(
        -LOGGER => $logger,
        -CONF   => $conf,
        -CACHE  => $cache,
    );

    # genes
    $gene_mappings =
        $mapper->map_genes($gene_scores, $transcript_scores, $gsb);

    # transcripts: also needed when exons or translations are requested,
    # because both of those steps take the transcript mappings as input
    my $need_transcripts = $mapping_types{'transcript'}
        || $mapping_types{'exon'}
        || $mapping_types{'translation'};

    if ($need_transcripts) {
        $transcript_mappings =
            $mapper->map_transcripts($transcript_scores, $gene_mappings, $tsb);
    }

    # exons
    if ($mapping_types{'exon'}) {
        $exon_mappings =
            $mapper->map_exons($exon_scores, $transcript_mappings, $esb);
    }

    # translations
    if ($mapping_types{'translation'}) {
        $translation_mappings =
            $mapper->map_translations($transcript_mappings);
    }
}
285 
286 
# Run map_stable_ids() for every requested entity type, then dump the debug
# mappings and write the 'new' stable_id_events to file.
sub assign_stable_ids {

    # (type, mappings) pairs in processing order:
    # exon, transcript, translation, gene
    my @jobs = (
        [ 'exon',        $exon_mappings ],
        [ 'transcript',  $transcript_mappings ],
        [ 'translation', $translation_mappings ],
        [ 'gene',        $gene_mappings ],
    );

    foreach my $job (@jobs) {
        my ($type, $mappings) = @{$job};

        # only types listed in mapping_types are processed
        next unless $mapping_types{$type};

        $stable_id_mapper->map_stable_ids($mappings, $type);
    }

    # dump mappings to file for debug purposes
    $stable_id_mapper->dump_debug_mappings;

    # write stable_id_events to file
    $stable_id_mapper->write_stable_id_events('new');

}
321 
322 
# Generate similarity events for the requested entity types (genes,
# transcripts, translations) and write them to file as 'similarity' events.
sub generate_similarity_events {

    $logger->info("Generating similarity events...\n", 0, 'stamped');

    my $want_transcripts  = $mapping_types{'transcript'};
    my $want_translations = $mapping_types{'translation'};

    # genes
    if ($mapping_types{'gene'}) {
        $logger->debug("genes\n", 1);
        $stable_id_mapper->generate_similarity_events(
            $gene_mappings, $gene_scores, 'gene');
    }

    # The transcript scores are passed through
    # filter_same_gene_transcript_similarities() once; the result is used
    # for both the transcript and the translation events.
    my $filtered_transcript_scores;
    if ($want_transcripts || $want_translations) {
        $filtered_transcript_scores =
            $stable_id_mapper->filter_same_gene_transcript_similarities(
                $transcript_scores);
    }

    # transcripts
    if ($want_transcripts) {
        $logger->debug("transcripts\n", 1);
        $stable_id_mapper->generate_similarity_events(
            $transcript_mappings, $filtered_transcript_scores, 'transcript');
    }

    # translations
    if ($want_translations) {
        $logger->debug("translations\n", 1);
        $stable_id_mapper->generate_translation_similarity_events(
            $translation_mappings, $filtered_transcript_scores);
    }

    # write stable_id_events to file
    $stable_id_mapper->write_stable_id_events('similarity');

    # write_retrofit_stable_id_events?? [todo]

    $logger->info("Done.\n\n", 0, 'stamped');
}
362 
363 
# Dump the 'source' stable_id_event table to stable_id_event_existing.txt
# and log the value returned by the dump as the number of entries written.
sub dump_existing_events {
    $logger->info("Dumping existing stable_id_events...\n", 0, 'stamped');

    my $entry_count = $stable_id_mapper->dump_table_to_file(
        'source',
        'stable_id_event',
        'stable_id_event_existing.txt',
        1,
    );

    $logger->info("Done writing $entry_count entries.\n\n", 0, 'stamped');
}
372 
373 
# Create the gene and peptide archive for a mapping session, then dump the
# existing 'source' gene_archive and peptide_archive tables to file.
#
# Arguments:
#   $mapping_session_id - passed through unchanged to create_archive()
sub archive {
    my ($mapping_session_id) = @_;

    $logger->info("Create gene and peptide archive...\n", 0, 'stamped');

    my $archiver = Bio::EnsEMBL::IdMapping::Archiver->new(
        -LOGGER => $logger,
        -CONF   => $conf,
        -CACHE  => $cache,
    );

    $archiver->create_archive($mapping_session_id);

    $logger->info("Done.\n\n", 0, 'stamped');

    # dump existing archive tables to file
    $logger->info("Dumping existing gene and peptide archive...\n", 0, 'stamped');

    my $gene_count = $archiver->dump_table_to_file(
        'source', 'gene_archive', 'gene_archive_existing.txt', 1);
    my $peptide_count = $archiver->dump_table_to_file(
        'source', 'peptide_archive', 'peptide_archive_existing.txt', 1);

    $logger->info("Done writing $gene_count gene_archive and $peptide_count peptide_archive entries.\n\n", 0, 'stamped');
}
401 
402 
# Upload mapping_session.txt and the three stable_id_event files into the
# 'target' tables.  Nothing is uploaded unless 'upload_events' is true and
# 'dry_run' is not set; in that case only a log message is written.
sub upload_mapping_session_and_events {
    my $do_upload =
        $conf->is_true('upload_events') && !$conf->param('dry_run');

    if (!$do_upload) {
        $logger->info("Stable ID event and mapping session tables not uploaded.\n\n");
    }
    else {
        $logger->info("Uploading mapping_session and stable_id_event tables...\n");

        $logger->info("mapping_session...\n", 1);
        my $session_total = 0;
        $session_total += $stable_id_mapper->upload_file_into_table(
            'target', 'mapping_session', 'mapping_session.txt');
        $logger->info("$session_total\n", 1);

        # The first event file is uploaded without the trailing flag, the
        # other two with a trailing 1 (meaning of the flag is defined by
        # upload_file_into_table() -- not visible here).
        my @event_uploads = (
            [ 'stable_id_event_existing.txt' ],
            [ 'stable_id_event_new.txt',        1 ],
            [ 'stable_id_event_similarity.txt', 1 ],
        );

        $logger->info("stable_id_event...\n", 1);
        my $event_total = 0;
        foreach my $file_args (@event_uploads) {
            $event_total += $stable_id_mapper->upload_file_into_table(
                'target', 'stable_id_event', @{$file_args});
        }
        $logger->info("$event_total\n", 1);

        $logger->info("Done.\n\n");
    }
}
431 
432 
# Upload "<type>_stable_id.txt" into the 'target' "<type>_stable_id" table
# for every configured mapping type.  Nothing is uploaded unless
# 'upload_stable_ids' is true and 'dry_run' is not set; in that case only a
# log message is written.
sub upload_stable_ids {
    my $do_upload =
        $conf->is_true('upload_stable_ids') && !$conf->param('dry_run');

    if (!$do_upload) {
        $logger->info("Stable ID tables not uploaded.\n\n");
    }
    else {
        $logger->info("Uploading stable ID tables...\n");

        foreach my $type ($conf->param('mapping_types')) {
            my $table = "${type}_stable_id";

            $logger->info("${table}...\n", 1);
            my $uploaded = $stable_id_mapper->upload_file_into_table(
                'target', $table, "${table}.txt");
            $logger->info("$uploaded\n", 1);
        }

        $logger->info("Done.\n\n");
    }
}
451 
452 
# Upload the existing and new gene_archive / peptide_archive files into the
# corresponding 'target' tables.  Nothing is uploaded unless
# 'upload_archive' is true and 'dry_run' is not set; in that case only a
# log message is written.
sub upload_archive {
    my $do_upload =
        $conf->is_true('upload_archive') && !$conf->param('dry_run');

    if (!$do_upload) {
        $logger->info("Gene and peptide archive tables not uploaded.\n\n");
    }
    else {
        $logger->info("Uploading gene and peptide tables...\n");

        foreach my $prefix (qw(gene peptide)) {
            my $table = "${prefix}_archive";
            $logger->info("${table}...\n", 1);

            # per table: the "existing" file first, then the "new" one
            my $uploaded = 0;
            foreach my $suffix (qw(existing new)) {
                $uploaded += $stable_id_mapper->upload_file_into_table(
                    'target', $table, "${table}_${suffix}.txt", 1);
            }

            $logger->info("$uploaded\n", 1);
        }

        $logger->info("Done.\n\n");
    }
}
474 
475 
# Analyse the gene mappings together with the 'similarity' stable ID
# events, write the results and click lists, then create the mapping
# summary.
sub analyse_results {

    $logger->info("Analysing results...\n", 0, 'stamped');

    my $analyser = Bio::EnsEMBL::IdMapping::ResultAnalyser->new(
        -LOGGER => $logger,
        -CONF   => $conf,
        -CACHE  => $cache,
    );

    # analyse results
    $analyser->analyse(
        $gene_mappings,
        $stable_id_mapper->get_all_stable_id_events('similarity'),
    );

    # write results to file
    $analyser->write_results_to_file;

    # create click lists
    $analyser->create_clicklist;

    $logger->info("Done.\n\n", 0, 'stamped');

    # mapping summary
    $logger->info("Creating mapping summary...\n", 0, 'stamped');
    $analyser->create_mapping_summary;
    $logger->info("Done.\n", 0, 'stamped');
}
504 
505 
#
# Log the memory consumption of the cache and of one sample target exon.
# Used for debugging; no call to this sub is visible in this file.
#
# Devel::Size is loaded on demand, because its import at the top of the file
# is commented out and size()/total_size() would otherwise be undefined.
# When the module cannot be loaded, a log message is written and the report
# is skipped.
#
sub log_cache_stats {

    unless (eval { require Devel::Size; 1 }) {
        $logger->info("\nCache memory usage not available (Devel::Size could not be loaded).\n\n");
        return;
    }

    $logger->info("\nCache memory usage:\n\n");

    my $overhead = 0;
    my %size_of;

    $size_of{'cache'} = Devel::Size::size($cache->{'cache'});

    foreach my $name (keys %{ $cache->{'cache'} }) {
        my $by_type = $cache->{'cache'}->{$name};
        $size_of{$name} = Devel::Size::size($by_type);

        foreach my $type (keys %{ $by_type }) {
            my $type_size = Devel::Size::size($by_type->{$type});

            # Qualify the label with $name: the same $type key occurring
            # under several names would otherwise overwrite earlier entries.
            $size_of{"$name/$type"} = $type_size;
            $overhead += $type_size;
        }
    }

    my $total = Devel::Size::total_size($cache->{'cache'});

    my $fmt = "%-50s%12.0f\n";

    # entries in ascending order of size
    foreach my $label (sort { $size_of{$a} <=> $size_of{$b} } keys %size_of) {
        $logger->info(sprintf($fmt, $label, $size_of{$label}), 1);
    }
    $logger->info(sprintf($fmt, "total overhead", $overhead), 1);
    $logger->info(sprintf($fmt, "data", ($total - $overhead)), 1);
    $logger->info(sprintf($fmt, "total", $total)."\n", 1);

    # test: report on a single target exon (whichever key comes first);
    # nothing is reported when there are no target exons
    my ($eid) = keys %{ $cache->get_by_name('exons_by_id', 'target') };

    if (defined $eid) {
        $logger->info("\nData object memory usage:\n\n");

        my $exon        = $cache->get_by_key('exons_by_id', 'target', $eid);
        my $object_size = Devel::Size::size($exon);
        my $object_all  = Devel::Size::total_size($exon);

        $logger->info(sprintf($fmt, "object", $object_size), 1);
        $logger->info(sprintf($fmt, "data", ($object_all - $object_size)), 1);
        $logger->info(sprintf($fmt, "total", $object_all)."\n", 1);

        print $exon->stable_id."\n";
        #warn Data::Dumper::Dumper($exon);
    }
}
555 
556 
transcript
public transcript()
upload_stable_ids
public upload_stable_ids()
exon
public exon()
run
public run()
upload_archive
public upload_archive()