ensembl-hive  2.6
Archiver.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2024] EMBL-European Bioinformatics Institute
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::IdMapping::Archiver - create gene_archive and peptide_archive
34 
35 =head1 SYNOPSIS
36 
37  my $archiver = Bio::EnsEMBL::IdMapping::Archiver->new(
38  -LOGGER => $logger,
39  -CONF => $conf,
40  -CACHE => $cache
41  );
42 
43  # create gene and peptide archive
44  $archiver->create_archive($mapping_session_id);
45 
46  # dump existing archive tables to file
47  my $num_entries =
48  $archiver->dump_table_to_file( 'source', 'gene_archive',
49  'gene_archive_existing.txt', 1 );
50 
51 =head1 DESCRIPTION
52 
53 This module creates the gene_archive and peptide_archive
54 tables. Data is written to a file as tab-delimited text for
55 loading into a MySQL database (this can be done manually, or using
56 StableIdmapper->upload_file_into_table()).
57 
58 An archive entry for a given source gene is created if no target
59 gene exists, or if any of its transcripts or their translations
60 changed. Non-coding transcripts only have an entry in gene_archive (i.e.
61 without a corresponding peptide_archive entry).
62 
63 =head1 METHODS
64 
65  create_archive
66  dump_gene
67  dump_tuple
68  dump_nc_row
69  mapping_session_id
70 
71 =cut
72 
73 
74 package Bio::EnsEMBL::IdMapping::Archiver;
75 
76 use strict;
77 use warnings;
78 no warnings 'uninitialized';
79 
82 
83 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
84 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
85 use Digest::MD5 qw(md5_hex);
86 
87 
88 # instance variables
89 my $pa_id;
90 
91 
92 =head2 create_archive
93 
94  Arg[1] : Int $mapping_session_id - the mapping_session_id for this run
95  Example : $archiver->create_archive($stable_id_mapper->mapping_session_id);
96  Description : Creates the gene_archive and peptide_archive tables and writes
97  the data to a tab-delimited file. The decision as to what to
98  archive is deferred to dump_gene(), see documentation there for
99  details.
100  Return type : none
101  Exceptions : none thrown for a missing argument; it is only logged as a warning.
102  Caller : id_mapping.pl
103  Status : At Risk
104  : under development
105 
106 =cut
107 
# Writes the gene_archive and peptide_archive dump files for one mapping run.
#
# Arg[1]  : Int $mapping_session_id - stored through mapping_session_id(); a
#           false value is only logged as a warning and the run continues.
# Returns : true on success
# Throws  : if one of the two output files cannot be closed cleanly. Buffered
#           write errors only surface at close(), so ignoring them could
#           leave a silently truncated dump behind.
sub create_archive {
  my $self = shift;
  my $mapping_session_id = shift;

  # argument check (non-fatal: a warning is logged, nothing is thrown)
  unless ($mapping_session_id) {
    $self->logger->warning("No mapping_session_id set.");
  }

  $self->mapping_session_id($mapping_session_id);

  # get filehandles to write gene and peptide archive
  my $ga_fh = $self->get_filehandle('gene_archive_new.txt', 'tables');
  my $pa_fh = $self->get_filehandle('peptide_archive_new.txt', 'tables');

  # get the currently highest peptide_archive_id from the source db
  my $s_dba = $self->cache->get_DBAdaptor('source');
  my $s_dbh = $s_dba->dbc->db_handle;
  my $sql = qq(SELECT MAX(peptide_archive_id) FROM peptide_archive);
  $pa_id = $self->fetch_value_from_db($s_dbh, $sql);

  unless ($pa_id) {
    $self->logger->warning("No max(peptide_archive_id) found in db.\n", 1);
    $self->logger->info("That's ok if this is the first stable ID mapping for this species.\n", 1);
  }

  # new entries start one above the current maximum (or at 1 if there is none)
  $pa_id++;
  $self->logger->debug("Starting with peptide_archive_id $pa_id.\n");

  # lookup hash of target genes, keyed by stable ID
  my %target_genes = map { $_->stable_id => $_ }
    values %{ $self->cache->get_by_name("genes_by_id", 'target') };

  # loop over source genes and dump to archive (dump_gene() will decide whether
  # to do full or partial dump)
  foreach my $source_gene (values %{ $self->cache->get_by_name("genes_by_id",
                                                               'source') }) {

    $self->dump_gene($source_gene, $target_genes{$source_gene->stable_id},
      $ga_fh, $pa_fh);
  }

  # Close both handles before reporting, so that a failure on the first one
  # does not leave the second one open.
  my @close_errors;
  close($ga_fh) or push @close_errors, "gene_archive_new.txt ($!)";
  close($pa_fh) or push @close_errors, "peptide_archive_new.txt ($!)";

  if (@close_errors) {
    throw("Unable to close archive file(s): " . join(', ', @close_errors));
  }

  # the original implicitly returned the (true) result of the last close()
  return 1;
}
153 
154 
155 =head2 dump_gene
156 
157  Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $s_gene - source gene
158  Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $t_gene - target gene
159  Arg[3] : Filehandle $ga_fh - filehandle for writing gene_archive data
160  Arg[4] : Filehandle $pa_fh - filehandle for writing peptide_archive data
161  Example : my $target_gene = $gene_mappings{$source_gene->stable_id};
162  $archiver->dump_gene($source_gene, $target_gene, $ga_fh, $pa_fh);
163  Description : Given a source gene, it will write a gene_archive and
164  peptide_archive entry for it if no target gene exists, or if any
165  of its transcripts or their translation changed.
166  Return type : none
167  Exceptions : none
168  Caller : create_archive()
169  Status : At Risk
170  : under development
171 
172 =cut
173 
# Archives every transcript of a source gene that has no identical counterpart
# among the transcripts of the target gene.
#
# Arg[1] : source gene (must provide get_all_Transcripts)
# Arg[2] : target gene, or a false value if there is none; in that case all
#          source transcripts are archived
# Arg[3] : filehandle for gene_archive data, passed on to dump_tuple()
# Arg[4] : filehandle for peptide_archive data, passed on to dump_tuple()
sub dump_gene {
  my ($self, $s_gene, $t_gene, $ga_fh, $pa_fh) = @_;

  # private method, so no argument check done for performance reasons

  for my $s_tr (@{ $s_gene->get_all_Transcripts }) {
    my $s_tl = $s_tr->translation;

    # a source transcript is archived unless an identical target is found
    my $found_identical = 0;

    if ($t_gene) {
      for my $t_tr (@{ $t_gene->get_all_Transcripts }) {
        my $t_tl = $t_tr->translation;

        # a source transcript with a translation can never match a target
        # transcript without one
        next if $s_tl and not $t_tl;

        # With a source translation, translation AND transcript must agree in
        # stable ID and version; without one only the transcript is compared.
        my $is_identical =
          $s_tl
          ? (    $s_tl->stable_id eq $t_tl->stable_id
              && $s_tl->version == $t_tl->version
              && $s_tr->stable_id eq $t_tr->stable_id
              && $s_tr->version == $t_tr->version )
          : (    $s_tr->stable_id eq $t_tr->stable_id
              && $s_tr->version == $t_tr->version );

        $found_identical = 1 if $is_identical;
      }
    }

    $self->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh)
      unless $found_identical;
  }
}
225 
226 
227 =head2 dump_tuple
228 
229  Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $gene - gene to archive
230  Arg[2] : Bio::EnsEMBL::IdMapping::TinyTranscript $tr - its transcript
231  Arg[3] : Bio::EnsEMBL::IdMapping::TinyTranslation $tl - its translation
232  Arg[4] : Filehandle $ga_fh - filehandle for writing gene_archive data
233  Arg[5] : Filehandle $pa_fh - filehandle for writing peptide_archive data
234  Example : $archive->dump_tuple($s_gene, $s_tr, $s_tl, $ga_fh, $pa_fh);
235  Description : Writes entry lines for gene_archive and peptide_archive.
236  Return type : none
237  Exceptions : none
238  Caller : dump_gene()
239  Status : At Risk
240  : under development
241 
242 =cut
243 
# Writes one tab-delimited gene_archive line and, when a translation is given,
# the matching peptide_archive line.
#
# Arg[1] : gene to archive
# Arg[2] : its transcript
# Arg[3] : its translation, or a false value for a transcript without one
# Arg[4] : filehandle receiving the gene_archive line
# Arg[5] : filehandle receiving the peptide_archive line
sub dump_tuple {
  my ($self, $gene, $tr, $tl, $ga_fh, $pa_fh) = @_;

  # private method, so no argument check done for performance reasons

  # the first four gene_archive columns identify gene and transcript
  my @ga_row = (
    $gene->stable_id,
    $gene->version,
    $tr->stable_id,
    $tr->version,
  );

  if ($tl) {
    # translation columns, pointing at the peptide_archive row written below
    push @ga_row,
      $tl->stable_id,
      $tl->version,
      $pa_id,
      $self->mapping_session_id;

    # peptide archive: id, MD5 checksum of the sequence, the sequence itself
    my $peptide = $tl->seq;
    print $pa_fh join("\t", $pa_id, md5_hex($peptide), $peptide) . "\n";

    # the next translation gets the next peptide_archive_id
    $pa_id++;
  }
  else {
    # no translation: the three translation columns are written as \N
    push @ga_row, ('\N') x 3, $self->mapping_session_id;
  }

  print $ga_fh join("\t", @ga_row) . "\n";
}
283 
284 
285 =head2 mapping_session_id
286 
287  Arg[1] : (optional) Int - mapping_session_id to set
288  Example : my $msi = $archiver->mapping_session_id;
289  Description : Getter/setter for mapping_session_id.
290  Return type : Int
291  Exceptions : none
292  Caller : create_archive()
293  Status : At Risk
294  : under development
295 
296 =cut
297 
# Getter/setter for the mapping_session_id kept on the object.
#
# Arg[1]  : (optional) value to store; any supplied argument, even undef,
#           replaces the current value
# Returns : the value currently stored under '_mapping_session_id'
sub mapping_session_id {
  my ($self, @new_value) = @_;

  if (@new_value) {
    $self->{'_mapping_session_id'} = $new_value[0];
  }

  return $self->{'_mapping_session_id'};
}
303 
304 
305 1;
306 
transcript
public transcript()
Bio::EnsEMBL::IdMapping::TinyFeature::stable_id
public String stable_id()
map
public map()
Bio::EnsEMBL::IdMapping::BaseObject
Definition: BaseObject.pm:25
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
archive
public archive()
Bio::EnsEMBL::IdMapping::BaseObject::new
public $this new()
Bio::EnsEMBL::IdMapping::TinyTranslation
Definition: TinyTranslation.pm:27
Bio::EnsEMBL::IdMapping::TinyGene
Definition: TinyGene.pm:28
run
public run()
Bio::EnsEMBL::IdMapping::Archiver
Definition: Archiver.pm:39
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68