ensembl-hive  2.7.0
OfficialNaming.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefMapper::OfficialNaming;
21 
22 use strict;
23 use warnings;
24 use Carp;
25 use Cwd;
26 use DBI;
27 use File::Basename;
28 
29 use vars '@ISA';
30 use base qw( XrefMapper::BasicMapper);
31 #@ISA = qw{ XrefMapper::BasicMapper };
32 
33 
34 ###############################################################################################
35 # Run the official naming code.
36 #
37 # At present this is done for the following species ONLY :-
38 # ZebraFish (ZFIN_ID),
39 # Human (HGNC)
40 # Mouse (MGI)
41 # Rat (RGD)
42 # Pig (PIGGY)
43 # There is currently no official domain source for pig, but it has manual annotation
44 # We use PIGGY as a fake official naming source
45 #
46 # 1) So we find the best official name for each gene
47 # order for this is:-
48 # i) official domain name source (HGNC, MGI, ZFIN_ID, RGD)
49 # ii) RFAM
50 # iii) miRBase
51 # iv) EntrezGene
52 #
53 # NOTE: for "i)" above, if more than one exists we find the "best" one if possible
54 # and remove the other ones. If there is more than one "best" we keep all and
55 # just choose the first one for the name
56 #
57 # To find the "best" one we use the priority.
58 # Priority should be set correctly in the xref_config.ini file to use
59 # first any names coming from the official naming source
60 #
61 # Set this as the display_xref for the gene.
62 #
63 # 2) Foreach Transcript of that gene
64 #
65 # we assign a transcript extension (splice number?)
66 # This is just a counter starting at 201 which is incremented each time
67 # We add this to the name to get a "XXX_trans_name"xref where XXX is the
68 # type of source used to get the name. This is then added as an xref and
69 # is set to the display_xref for that transcript.
70 #
71 ##############################################################################################
72 
73 
74 ####################################
75 # Create OfficialNaming object
76 # Get some info from the BasicMapper
77 ####################################
sub new {
    my ($class, $mapper) = @_;

    # The object is a plain blessed hash; all of its state is copied from
    # the supplied mapper through the accessor calls below.
    my $self = bless {}, $class;

    $self->core( $mapper->core );
    $self->xref( $mapper->xref );
    $self->get_official_name( $mapper->get_official_name );

    return $self;
}
88 
89 
90 ##################################################
91 # This will be the official database name
92 # HGNC, MGI, ZFIN_ID or PIGGY, comes from BasicMapper
93 #################################################
sub get_official_name {
    my ($self, $arg) = @_;

    # Combined getter/setter: a defined argument replaces the stored name,
    # an undefined (or missing) argument leaves it untouched.
    if (defined $arg) {
        $self->{_official_name} = $arg;
    }

    return $self->{_official_name};
}
101 
102 
103 
104 ##################################################
105 # This is the main subroutine that does everything
106 ##################################################
sub run {
    # Entry point: picks a display xref for every gene and derives numbered
    # transcript display xrefs from it.
    #
    # Arguments:
    #   $species_id - optional; looked up from the core species name when
    #                 not supplied.
    # Returns nothing. The process status is set to 'official_naming_done'
    # on both exit paths.
    my ($self, $species_id) = @_;

    my $dbname = $self->get_official_name();
    my $dbi    = $self->xref->dbc;

    ###########################################################
    # If no official name then we do not want to go any further
    # Just set status to official_naming_done and return
    ###########################################################
    if (!defined $dbname) {
        $self->update_process_status("official_naming_done");
        return;
    }
    $species_id = $self->get_id_from_species_name($self->core->species) unless defined $species_id;
    $self->species_id($species_id);

    ###########################################################
    # If there are any official names on transcripts or translations
    # move them onto gene level
    #
    # This is done for 2 reasons
    # 1) to make the code the same as HGNC is on a gene
    #    and it makes it easier to find.
    # 2) Later on these are copied to the canonical transcripts
    #    from the genes so move them now.
    ###########################################################
    if ($dbname eq "MGI" || $dbname eq "ZFIN_ID" || $dbname eq "RGD") {
        $self->biomart_fix($dbname, "Translation", "Gene");
        $self->biomart_fix($dbname, "Transcript", "Gene");
    }

    ######################################################
    # Get the current max values for xref and object_xref
    ######################################################
    my ($max_object_xref_id, $max_xref_id) = $self->find_max_ids($dbi);

    my %display_label_to_desc;
    $self->get_display_label_data(\%display_label_to_desc, $dbi);

    # get the official naming external_sources
    my $dbname_to_source_id = $self->get_new_dbname_sources($dbi); # reference to hash

    ###########################
    # Delete the old ones.
    ###########################
    $self->delete_old_data($dbname_to_source_id, $dbi);

    $self->reset_display_xrefs($dbi);

    # NOTE(review): neither adaptor is used below; they are still created
    # (now with direct method-call syntax) in case construction is relied
    # upon for side effects - confirm and remove if not.
    my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(-dbconn => $self->core->dbc);
    my $ga = $db->get_GeneAdaptor();

    my %gene_to_transcripts;
    my %gene_id_to_stable_id;

    my $sql =(<<'SQ0');
SELECT gtt.gene_id, gtt.transcript_id, gsi.stable_id, tsi.stable_id
 FROM gene_transcript_translation gtt, gene_stable_id gsi, transcript_stable_id tsi
 WHERE gtt.gene_id = gsi.internal_id AND
 gtt.transcript_id = tsi.internal_id
 ORDER BY gsi.stable_id, tsi.stable_id
SQ0

    my $sth = $dbi->prepare($sql);
    $sth->execute;
    my ($row_gene_id, $row_tran_id, $row_gene_stable_id, $row_tran_stable_id);
    $sth->bind_columns(\$row_gene_id, \$row_tran_id, \$row_gene_stable_id, \$row_tran_stable_id);

    # Genes are remembered in the order the query returns them (ordered by
    # gene stable id) so they are processed in a reproducible order.
    my @sorted_gene_ids;
    while ($sth->fetch) {
        if (!defined $gene_to_transcripts{$row_gene_id}) {
            push @sorted_gene_ids, $row_gene_id;
        }
        push @{ $gene_to_transcripts{$row_gene_id} }, $row_tran_id;
        $gene_id_to_stable_id{$row_gene_id} = $row_gene_stable_id;
    }

    my $set_gene_display_xref_sth = $self->get_set_gene_display_xref_sth($dbi);

    my %xref_added; # store those added $xref_added{$accession:$source_id} = $xref_id;
    my %seen_gene;
    my %official_name_used;

    # EntrezGene object_xrefs that depend on a "Refseq%predicted" master are
    # collected here and passed on so they are never used as a gene name.
    my $ignore_sql =<<'IEG';
 SELECT DISTINCT ox.object_xref_id
 FROM object_xref ox, dependent_xref dx,
 xref xmas, xref xdep,
 source smas, source sdep
 WHERE ox.xref_id = dx.dependent_xref_id AND
 dx.dependent_xref_id = xdep.xref_id AND
 dx.master_xref_id = xmas.xref_id AND
 xmas.source_id = smas.source_id AND
 xdep.source_id = sdep.source_id AND
 smas.name like "Refseq%predicted" AND
 sdep.name like "EntrezGene" AND
 ox.ox_status = "DUMP_OUT"
IEG

    my %ignore_object;
    my $ignore_sth = $dbi->prepare($ignore_sql);
    $ignore_sth->execute();
    my ($ignore_object_xref_id);
    $ignore_sth->bind_columns(\$ignore_object_xref_id);
    while ($ignore_sth->fetch()) {
        $ignore_object{$ignore_object_xref_id} = 1;
    }
    $ignore_sth->finish;

    # Iterate with foreach rather than "while (shift ...)" so that a false
    # id value cannot end the loop prematurely.
    foreach my $gene_id (@sorted_gene_ids) {

        # May be overwritten (through the reference passed below) when the
        # name comes from one of the fallback sources.
        my $tran_source = $dbname;

        # symbols to set when found.
        my $gene_symbol;
        my $gene_symbol_xref_id;
        my $is_lrg = 0;

        ################################
        # Get official name if it has one
        ################################
        ($gene_symbol, $gene_symbol_xref_id) =
            $self->get_official_domain_name({
                gene_id              => $gene_id,
                gene_to_tran         => \%gene_to_transcripts,
                gene_id_to_stable_id => \%gene_id_to_stable_id,
                official_name_used   => \%official_name_used,
                dbi                  => $dbi,
            });

        if (defined $gene_symbol_xref_id) {
            $official_name_used{$gene_symbol_xref_id} = 1;
        }

        ############################################
        # If not found see if there is an LRG entry
        ############################################
        if (!defined $gene_symbol) { # look for LRG
            ($gene_symbol, $gene_symbol_xref_id, $is_lrg) = $self->find_lrg_hgnc($gene_id, $dbi);
        }

        ####################################################
        # If not found look for other valid database sources
        # These are RFAM and miRBase, as well as EntrezGene
        ####################################################
        if (!defined $gene_symbol) {
            ($gene_symbol, $gene_symbol_xref_id) =
                $self->find_from_other_sources(\%ignore_object,
                                               { gene_id       => $gene_id,
                                                 label_to_desc => \%display_label_to_desc,
                                                 dbi           => $dbi,
                                                 tran_source   => \$tran_source });
        }

        next unless defined $gene_symbol;

        my $desc = $display_label_to_desc{$gene_symbol};
        $set_gene_display_xref_sth->execute($gene_symbol_xref_id, $gene_id);

        # Names found through the LRG route get no transcript display xrefs.
        next if $is_lrg;

        $self->set_transcript_display_xrefs({
            max_xref             => \$max_xref_id,
            max_object           => \$max_object_xref_id,
            gene_id              => $gene_id,
            gene_id_to_stable_id => \%gene_id_to_stable_id,
            gene_symbol          => $gene_symbol,
            desc                 => $desc,
            dbi                  => $dbi,
            source_id            => $dbname_to_source_id->{$tran_source."_trans_name"},
            xref_added           => \%xref_added,
            seen_gene            => \%seen_gene,
            gene_to_tran         => \%gene_to_transcripts,
            tran_source          => $tran_source,
        });

    } # for each gene

    $self->update_process_status('official_naming_done');
    return;
}
310 
311 
312 
313 
314 ####################################################################
315 # Get official name if it has one
316 #
317 # Search gene for dbname entries.
318 # dbname (HGNC||MGI||ZFIN_ID|RGD) dependent on species
319 #
320 # Find the "best" one
321 # Remove the lesser ones (set status to MULTI_DELETE for object_xref)
322 #
323 # return the gene_symbol and xref_id of the best one
324 ######################################################################
325 
sub get_official_domain_name {
    # Pick the gene symbol for one gene from the official naming source.
    #
    # Arguments (hash reference):
    #   gene_id              - internal gene id
    #   gene_id_to_stable_id - hash ref, used for log messages only
    #   official_name_used   - hash ref of xref_ids already given to a gene
    #   dbi                  - database handle for the xref database
    #   gene_to_tran         - accepted for compatibility, not used here
    #
    # Returns ($gene_symbol, $gene_symbol_xref_id); both are undef when the
    # gene has no xref from the official source.
    my ($self, $arg_ref) = @_;

    my $gene_id              = $arg_ref->{gene_id};
    my $gene_id_to_stable_id = $arg_ref->{gene_id_to_stable_id};
    my $official_name_used   = $arg_ref->{official_name_used};
    my $dbi                  = $arg_ref->{dbi};

    my $dbname = $self->get_official_name();
    my $gene_symbol;
    my $gene_symbol_xref_id;

    my $dbentrie_sth = $self->get_dbentrie_sth($dbi);

    my %ODN;                  # xref_ids sharing the best (lowest) priority
    my %xref_id_to_display;

    $dbentrie_sth->execute($dbname, $gene_id, "Gene");
    my ($display, $xref_id, $object_xref_id, $level);
    $dbentrie_sth->bind_columns(\$display, \$xref_id, \$object_xref_id, \$level);
    my $best_level = 999;

    my @list;                 # every xref_id seen, in fetch order
    my @list_ox;              # matching object_xref_ids, parallel to @list

    while ($dbentrie_sth->fetch) {
        push @list,    $xref_id;
        push @list_ox, $object_xref_id;
        $xref_id_to_display{$xref_id} = $display;
        if ($level < $best_level) {
            %ODN        = ($xref_id => 1);
            $best_level = $level;
        }
        elsif ($level == $best_level) {
            $ODN{$xref_id} = 1;
        }
    }

    if ((scalar(@list) > 1) and (scalar(keys %ODN) == 1)) { # found one that is "best" so set it and remove others
        print "For gene ".$gene_id_to_stable_id->{$gene_id}." we have mutiple ".$dbname."'s\n";
        ($gene_symbol, $gene_symbol_xref_id) = $self->set_the_best_odns(\%ODN, \@list, \@list_ox, \%xref_id_to_display, $dbi);
        if (defined $gene_symbol) {
            return $gene_symbol, $gene_symbol_xref_id;
        }
    }

    if (scalar(keys %ODN) == 1) { # one hgnc to this gene - perfect case :-)
        my ($only_xref_id) = keys %ODN;
        return $xref_id_to_display{$only_xref_id}, $only_xref_id;
    }

    if (scalar(keys %ODN) > 1) {

        # if we have more than 1 xref, fail xrefs with worse % identity if we
        # can (query or target identity whichever is greater)
        my $identity_sth = $self->get_best_identity_sth($dbi);
        $identity_sth->execute($dbname, $gene_id, "Gene");
        my ($identity_xref_id, $best_identity);
        $identity_sth->bind_columns(\$identity_xref_id, \$best_identity);
        my $temp_best_identity = 0;
        my %best_ids;

        # Rows arrive ordered by identity, best first, so the loop can stop
        # at the first row that is worse than the best seen.
        while ($identity_sth->fetch) {
            # An undefined identity compares as 0 anyway; make that explicit
            # to avoid "uninitialized value" warnings.
            $best_identity = 0 unless defined $best_identity;

            if ($best_identity > $temp_best_identity) {
                %best_ids           = ($identity_xref_id => 1);
                $temp_best_identity = $best_identity;
            }
            elsif ($best_identity == $temp_best_identity) {
                $best_ids{$identity_xref_id} = 1;
            }
            else {
                last;
            }
        }
        # The loop may stop before all rows are read; release the handle.
        $identity_sth->finish;

        # check if we were able to reduce the number of xrefs based on % identity
        if ( scalar(keys %best_ids) > 0 && scalar(keys %best_ids) < scalar(keys %ODN) ) {
            %ODN = %best_ids;
            print "For gene ".$gene_id_to_stable_id->{$gene_id}." we have mutiple ".$dbname."'s\n";
            # set statuses for xrefs with worse % identity to MULTI_DELETE
            ($gene_symbol, $gene_symbol_xref_id) = $self->set_the_best_odns(\%ODN, \@list, \@list_ox, \%xref_id_to_display, $dbi);
            if ( defined($gene_symbol) && scalar(keys %ODN) == 1 ) {
                return $gene_symbol, $gene_symbol_xref_id;
            }
        }

        # take the name which hasn't been already assigned to another gene,
        # if possible. Candidates are visited in ascending xref_id order so
        # the choice no longer depends on Perl's randomised hash ordering.
        my @candidates = sort { $a <=> $b } keys %ODN;
        my ($xref_not_used) = grep { !defined $official_name_used->{$_} } @candidates;

        my $chosen      = defined $xref_not_used ? $xref_not_used : $candidates[0];
        my $chosen_note = defined $xref_not_used ? " chosen\n" : " (chosen as first)\n";

        foreach my $x (@candidates) {
            print "\t".$xref_id_to_display{$x};
            if ($x == $chosen) {
                print $chosen_note;
                $gene_symbol         = $xref_id_to_display{$x};
                $gene_symbol_xref_id = $x;
            }
            else {
                print " (left as $dbname reference but not gene symbol)\n";
            }
        }
    }

    return ($gene_symbol, $gene_symbol_xref_id);
}
464 
465 
466 ###########################################################
467 # Set the transcript display xrefs
468 #
469 # Use the gene symbol to create a transcript display xref
470 # Add 201 and increment.
471 ###########################################################
sub set_transcript_display_xrefs {
    # Give every transcript of a gene a "<gene symbol>-<number>" display
    # xref, numbering from 201 (or continuing from the last number used for
    # the same symbol).
    #
    # Arguments (hash reference):
    #   max_xref, max_object - scalar refs to the running maximum ids;
    #                          incremented in place
    #   gene_id, gene_symbol, desc, source_id, tran_source
    #   xref_added           - hash ref "accession:source_id" => xref_id
    #   seen_gene            - hash ref gene symbol => next free number
    #   gene_to_tran         - hash ref gene_id => array ref of transcript ids
    #   gene_id_to_stable_id - hash ref gene_id => stable id
    #   dbi                  - database handle for the xref database
    #
    # Croaks if a transcript needs naming but source_id is undefined.
    my ($self, $arg_ref) = @_;

    my $max_xref_id          = $arg_ref->{max_xref};
    my $max_object_xref_id   = $arg_ref->{max_object};
    my $gene_id              = $arg_ref->{gene_id};
    my $gene_symbol          = $arg_ref->{gene_symbol};
    my $desc                 = $arg_ref->{desc};
    my $source_id            = $arg_ref->{source_id};
    my $xref_added           = $arg_ref->{xref_added};
    my $seen_gene            = $arg_ref->{seen_gene};
    my $gene_to_transcripts  = $arg_ref->{gene_to_tran};
    my $tran_source          = $arg_ref->{tran_source};
    my $gene_id_to_stable_id = $arg_ref->{gene_id_to_stable_id};
    my $dbi                  = $arg_ref->{dbi};

    # LRG genes are skipped; check this before preparing any statement so
    # that skipped genes cost nothing.
    my $stable_id = $gene_id_to_stable_id->{$gene_id};
    return if defined $stable_id && $stable_id =~ /LRG/;

    # statement handles needed
    my $ins_xref_sth              = $self->get_ins_xref_sth($dbi);
    my $ins_dep_ix_sth            = $self->get_ins_dep_ix_sth($dbi);
    my $set_tran_display_xref_sth = $self->get_set_transcript_display_xref_sth($dbi);
    my $ins_object_xref_sth       = $self->get_ins_object_xref_sth($dbi);

    my $ext = defined $seen_gene->{$gene_symbol} ? $seen_gene->{$gene_symbol} : 201;

    foreach my $tran_id ( @{ $gene_to_transcripts->{$gene_id} } ) {
        my $id = $gene_symbol."-".$ext;
        if (!defined $source_id) {
            croak "id = $id\n but NO source_id for this entry for $tran_source???\n";
        }

        # One xref per distinct name and source; reuse it if already added.
        my $key = $id.":".$source_id;
        if (!defined $xref_added->{$key}) {
            $$max_xref_id++;
            $ins_xref_sth->execute($$max_xref_id, $source_id, $id, $id, "", $desc);
            $xref_added->{$key} = $$max_xref_id;
        }

        $set_tran_display_xref_sth->execute($xref_added->{$key}, $tran_id);
        $$max_object_xref_id++;
        $ins_object_xref_sth->execute($$max_object_xref_id, $tran_id, 'Transcript', $xref_added->{$key}, undef);
        $ins_dep_ix_sth->execute($$max_object_xref_id, 100, 100);
        $ext++;
    }

    # Remember where numbering stopped so another gene with the same symbol
    # continues from there.
    $seen_gene->{$gene_symbol} = $ext;
    return;
}
521 
522 
523 #################################################
524 # Get statement handle to retrieve what xrefs
525 # are attached to a specific ensembl_id and type
526 # for a particular source name
527 #################################################
sub get_dbentrie_sth {
    my ($self, $dbi) = @_;

    # Placeholders, in order: source name, ensembl_id, ensembl_object_type.
    # Only object_xrefs with status DUMP_OUT are selected.
    my $query = <<'DBENTRIE_SQL';
SELECT x.label, x.xref_id, ox.object_xref_id, s.priority
 FROM xref x, object_xref ox, source s
 WHERE x.xref_id = ox.xref_id AND
 x.source_id = s.source_id AND
 s.name = ? AND
 ox.ox_status = 'DUMP_OUT' AND
 ox.ensembl_id = ? AND
 ox.ensembl_object_type = ?
DBENTRIE_SQL

    return $dbi->prepare($query);
}
546 
547 #################################################
548 # Get statement handle to retrieve what xrefs
549 # are attached to a specific ensembl_id and type
550 # for a particular source name with description
551 #################################################
sub get_dbentrie_with_desc_sth {
    my ($self, $dbi) = @_;

    # Same selection as get_dbentrie_sth with x.description as an extra
    # fifth column. Placeholders, in order: source name, ensembl_id,
    # ensembl_object_type.
    my $query = <<'DBENTRIE_DESC_SQL';
SELECT x.label, x.xref_id, ox.object_xref_id, s.priority, x.description
 FROM xref x, object_xref ox, source s
 WHERE x.xref_id = ox.xref_id AND
 x.source_id = s.source_id AND
 s.name = ? AND
 ox.ox_status = 'DUMP_OUT' AND
 ox.ensembl_id = ? AND
 ox.ensembl_object_type = ?
DBENTRIE_DESC_SQL

    return $dbi->prepare($query);
}
570 
571 #################################################
572 # Get statement handle to retrieve the greater of query
573 # and target identity for xrefs
574 #################################################
sub get_best_identity_sth {
    my ($self, $dbi) = @_;

    # For each matching xref the query yields whichever of query_identity
    # and target_identity is larger, ordered best first.
    # Placeholders, in order: source name, ensembl_id, ensembl_object_type.
    my $query = <<'BEST_IDENTITY_SQL';
SELECT x.xref_id, CASE WHEN ix.query_identity >= ix.target_identity
THEN ix.query_identity ELSE ix.target_identity END as best_identity
FROM xref x, object_xref ox, identity_xref ix, source s
WHERE x.xref_id = ox.xref_id AND x.source_id = s.source_id
 AND ox.object_xref_id = ix.object_xref_id AND s.name = ?
 AND ox.ox_status = 'DUMP_OUT' AND ox.ensembl_id = ?
 AND ox.ensembl_object_type = ? order by best_identity DESC
BEST_IDENTITY_SQL

    return $dbi->prepare($query);
}
591 
592 
593 #################################################
594 # Get statement handle to set the display xref
595 # for a transcript in the xref database.
596 # Stored in the transcript_stable_id table.
597 #################################################
sub get_set_transcript_display_xref_sth {
    my ($self, $dbi) = @_;

    # Placeholders, in order: display_xref_id, transcript internal_id.
    my $update_sql = 'UPDATE transcript_stable_id SET display_xref_id =? where internal_id = ?';

    return $dbi->prepare($update_sql);
}
604 
605 
606 #################################################
607 # Get statement handle to set the display xref
608 # for a gene in the xref database.
609 # Stored in the gene_stable_id table.
610 #################################################
sub get_set_gene_display_xref_sth {
    my ($self, $dbi) = @_;

    # Placeholders, in order: display_xref_id, gene internal_id.
    my $update_sql = 'UPDATE gene_stable_id SET display_xref_id =? where internal_id = ?';

    return $dbi->prepare($update_sql);
}
617 
618 
619 ###############################################
620 # Get statement handle to insert an xref
621 ###############################################
sub get_ins_xref_sth {
    my ($self, $dbi) = @_;

    # The species id is baked into the statement text; version is fixed at 0
    # and info_type at 'MISC'. Placeholders, in order: xref_id, source_id,
    # accession, label, info_text, description.
    my $insert_sql = join '',
        "insert ignore into xref (xref_id, source_id, accession, label, version, species_id, info_type, info_text, description) values (?, ?, ?, ?, 0, ",
        $self->species_id,
        ", 'MISC', ?, ? )";

    return $dbi->prepare($insert_sql);
}
630 
631 
632 #################################################
633 # Get statement handle to insert an identity xref
634 #################################################
sub get_ins_dep_ix_sth {
    my ($self, $dbi) = @_;

    # Placeholders, in order: object_xref_id, query_identity, target_identity.
    my $insert_sql = "insert into identity_xref (object_xref_id, query_identity, target_identity) values(?, ?, ?)";

    return $dbi->prepare($insert_sql);
}
643 
644 ###############################################
645 # Get statement handle to insert an object_xref
646 ###############################################
sub get_ins_object_xref_sth {
    my ($self, $dbi) = @_;

    # linkage_type is fixed at 'MISC' and ox_status at 'DUMP_OUT'.
    # Placeholders, in order: object_xref_id, ensembl_id,
    # ensembl_object_type, xref_id, unused_priority.
    my $insert_sql = "insert into object_xref (object_xref_id, ensembl_id, ensembl_object_type, xref_id, linkage_type, ox_status, unused_priority) values (?, ?, ?, ?, 'MISC', 'DUMP_OUT', ?)";

    return $dbi->prepare($insert_sql);
}
655 
656 
657 
sub find_max_ids {
    # Look up the current maximum ids so new rows can be numbered after them.
    #
    # Arguments: $dbi - database handle for the xref database.
    # Returns ($max_object_xref_id, $max_xref_id). MAX() on an empty table
    # yields NULL; that is reported as 0 so callers can increment the value
    # without "uninitialized value" warnings.
    my ($self, $dbi) = @_;

    my @queries = (
        "SELECT MAX(object_xref_id) FROM object_xref",
        "SELECT MAX(object_xref_id) FROM identity_xref",
        "SELECT MAX(xref_id) FROM xref",
    );

    my @max_values;
    foreach my $sql (@queries) {
        my $value;
        my $sth = $dbi->prepare($sql);
        $sth->execute();
        $sth->bind_columns(\$value);
        $sth->fetch;
        $sth->finish;
        push @max_values, (defined $value ? $value : 0);
    }

    # The identity_xref maximum is only reported, not returned.
    my ($max_object_xref_id, $max_object_xref_id2, $max_xref_id) = @max_values;

    print "MAX xref_id = $max_xref_id MAX object_xref_id = $max_object_xref_id, max_object_xref from identity_xref = $max_object_xref_id2\n";
    return $max_object_xref_id, $max_xref_id;
}
684 
sub get_synonyms {
    # Fill the caller's hash with synonym => label for every xref of the
    # official naming source.
    #
    # Arguments:
    #   $synonym - hash ref to populate
    #   $dbi     - database handle for the xref database
    my ($self, $synonym, $dbi) = @_;

    my $dbname = $self->get_official_name();

    # The source name is bound as a placeholder rather than interpolated
    # into the SQL text, matching the other queries in this module.
    my $syn_sql = (<<'SYN');
SELECT es.synonym, x.label
 FROM synonym es, xref x, source s
 WHERE x.xref_id = es.xref_id AND
 x.source_id = s.source_id AND
 s.name = ?
SYN

    my $sth = $dbi->prepare($syn_sql);
    $sth->execute($dbname);
    my ($syn, $name);
    $sth->bind_columns(\$syn, \$name);
    while ($sth->fetch) {
        $synonym->{$syn} = $name;
    }
    $sth->finish;
    return;
}
708 
sub get_display_label_data {
    # Fill the caller's hash with name => description for the official
    # naming source, keyed first by synonym and then by xref label.
    #
    # Arguments:
    #   $label_to_desc - hash ref to populate
    #   $dbi           - database handle for the xref database
    my ($self, $label_to_desc, $dbi) = @_;

    my $dbname = $self->get_official_name();

    # The source name is bound as a placeholder in both queries rather than
    # interpolated into the SQL text.
    my $gd1_sql = (<<'GD1');
SELECT x.accession, sy.synonym, x.description
 FROM synonym sy, xref x, source so
 WHERE x.xref_id = sy.xref_id AND
 so.source_id = x.source_id AND
 so.name like ?
GD1

    my $gd1_sth = $dbi->prepare($gd1_sql);

    $gd1_sth->execute($dbname);
    my ($display_label, $acc, $desc);
    # In this first query the "label" column is the synonym text.
    $gd1_sth->bind_columns(\$acc, \$display_label, \$desc);
    while ($gd1_sth->fetch) {
        $label_to_desc->{$display_label} = $desc;
    }
    $gd1_sth->finish;

    # get label to id from xref database to start with.
    my $gd2_sql = (<<'GD2');
SELECT x.accession, x.label, x.description
 FROM xref x, source s
 WHERE s.source_id = x.source_id AND
 s.name like ?
GD2

    my $gd2_sth = $dbi->prepare($gd2_sql);

    $gd2_sth->execute($dbname);
    $gd2_sth->bind_columns(\$acc, \$display_label, \$desc);
    while ($gd2_sth->fetch) {
        # Unlike the synonym pass, an undefined description is reported and
        # does not overwrite an existing entry.
        if (!defined $desc) {
            warn "undef desc for $display_label\n";
        }
        else {
            $label_to_desc->{$display_label} = $desc;
        }
    }
    $gd2_sth->finish;
    return;
}
760 
sub get_other_name_hash {
    my ($self) = @_;

    # Created on first use; later calls hand back the same hash reference
    # so counts accumulate across calls.
    unless (defined $self->{'_other_name'}) {
        $self->{'_other_name'} = {};
    }

    return $self->{'_other_name'};
}
770 
771 
772 
773 
sub find_from_other_sources {
    # Fallback naming: try miRBase, then RFAM, then EntrezGene and use the
    # first acceptable xref on the gene.
    #
    # Arguments:
    #   $ignore_object - hash ref of object_xref_ids that must not be used
    #   $ref_args      - hash ref with:
    #       gene_id       - internal gene id
    #       label_to_desc - hash ref label => description; updated in place
    #       dbi           - database handle for the xref database
    #       tran_source   - scalar ref; set to the source that gave the name
    #
    # Returns ($gene_symbol, $gene_symbol_xref_id), both undef if nothing
    # acceptable is found. miRBase and RFAM symbols get a ".N" suffix where
    # N counts how often that label has been used.
    my ($self, $ignore_object, $ref_args) = @_;
    my $tran_source           = $ref_args->{tran_source};
    my $gene_id               = $ref_args->{gene_id};
    my $display_label_to_desc = $ref_args->{label_to_desc};
    my $dbi                   = $ref_args->{dbi};

    my ($gene_symbol, $gene_symbol_xref_id);
    my $dbentrie_sth   = $self->get_dbentrie_with_desc_sth($dbi);
    my $other_name_num = $self->get_other_name_hash();

    my ($display, $xref_id, $object_xref_id, $level, $desc);

    SOURCE:
    foreach my $ext_db_name (qw(miRBase RFAM EntrezGene)) {
        $dbentrie_sth->execute($ext_db_name, $gene_id, "Gene");
        $dbentrie_sth->bind_columns(\$display, \$xref_id, \$object_xref_id, \$level, \$desc);

        while ($dbentrie_sth->fetch) {
            # Labels starting with LOC or SSC are never used as names.
            next if $display =~ /^LOC/ || $display =~ /^SSC/;

            # The ignore list is read through the reference; copying it for
            # every gene would be wasted work.
            next if defined $ignore_object->{$object_xref_id};

            $gene_symbol         = $display;
            $gene_symbol_xref_id = $xref_id;
            $$tran_source        = $ext_db_name;

            # The description is stored under the unsuffixed label.
            $display_label_to_desc->{$display} = $desc;

            # Incrementing a missing entry yields 1.
            $other_name_num->{$display}++;
            if ($ext_db_name eq 'miRBase' || $ext_db_name eq 'RFAM') {
                $gene_symbol .= ".".$other_name_num->{$display};
            }

            # First hit wins: release the handle and skip the remaining
            # sources instead of running queries whose rows are discarded.
            $dbentrie_sth->finish;
            last SOURCE;
        }
    }
    return ($gene_symbol, $gene_symbol_xref_id);
}
820 
821 
822 #
823 # We do not delete this but set the status to "MULTI_DELETE"
824 #
825 
sub get_delete_odn_sth {
    my ($self, $dbi) = @_;

    # Nothing is deleted: the object_xref row is only flagged.
    # Placeholder: object_xref_id.
    my $flag_sql = 'UPDATE object_xref SET ox_status = "MULTI_DELETE" where object_xref_id = ?';

    return $dbi->prepare($flag_sql);
}
832 
sub set_the_best_odns {
    my ($self, $odn, $ref_list, $ref_list_ox, $ref_xref_id_to_display, $dbi) = @_;

    # $ref_list and $ref_list_ox are parallel arrays (xref_id and its
    # object_xref_id). Entries whose xref_id is a key of %$odn are kept;
    # all others have their object_xref flagged MULTI_DELETE.
    my $delete_odn_sth = $self->get_delete_odn_sth($dbi);

    my ($gene_symbol, $gene_symbol_xref_id);

    foreach my $idx (0 .. $#{$ref_list}) {
        my $candidate = $ref_list->[$idx];

        if (exists $odn->{$candidate}) {
            # When several entries are kept, the last one supplies the
            # returned symbol.
            print "\tKeeping the best one ".$ref_xref_id_to_display->{$candidate}."\n";
            $gene_symbol         = $ref_xref_id_to_display->{$candidate};
            $gene_symbol_xref_id = $candidate;
            next;
        }

        print "\tremoving ".$ref_xref_id_to_display->{$candidate}." from gene\n";
        # remove object xref....
        $delete_odn_sth->execute($ref_list_ox->[$idx])
            || croak "Could not set staus to MULTI_DELETE for object_xref ".$ref_list_ox->[$idx]."\n";
    }

    return ($gene_symbol, $gene_symbol_xref_id);
}
859 
860 ########################## START LRG BIT ######################################################
861 
sub get_lrg_find_sth {
    my ($self, $dbi) = @_;

    # Same columns as get_dbentrie_sth but with no ox_status filter.
    # Placeholders, in order: source name, ensembl_id, ensembl_object_type.
    my $query = <<'LRG_FIND_SQL';
SELECT x.label, x.xref_id, ox.object_xref_id, s.priority
 FROM xref x, object_xref ox, source s
 WHERE x.xref_id = ox.xref_id AND
 x.source_id = s.source_id AND
 s.name = ? AND
 ox.ensembl_id = ? AND
 ox.ensembl_object_type = ?
LRG_FIND_SQL

    return $dbi->prepare($query);
}
878 
879 
sub get_lrg_set_status_sth {
    my ($self, $dbi) = @_;

    # Flags one object_xref as NO_DISPLAY. Placeholder: object_xref_id.
    my $flag_sql = "update object_xref set ox_status = 'NO_DISPLAY' where object_xref_id = ?";

    return $dbi->prepare($flag_sql);
}
887 
sub get_lrg_to_hgnc_sth {
    my ($self, $dbi) = @_;

    # Finds xrefs by label within one source, restricted to DUMP_OUT
    # object_xrefs and ordered by source priority.
    # Placeholders, in order: xref label, source name.
    my $query = <<'LRG_TO_HGNC_SQL';
SELECT x.xref_id, s.priority
 FROM xref x,source s, object_xref ox
 WHERE x.xref_id = ox.xref_id AND
 x.source_id = s.source_id AND
 x.label = ? AND
 s.name = ? AND
 ox.ox_status = 'DUMP_OUT'
 ORDER BY s.priority
LRG_TO_HGNC_SQL

    return $dbi->prepare($query);
}
905 
906 
sub find_lrg_hgnc {
    my ($self, $gene_id, $dbi) = @_;

    # Returns ($gene_symbol, $gene_symbol_xref_id, $is_lrg). $is_lrg is 1
    # only when an HGNC xref with the same label was found.
    my ($gene_symbol, $gene_symbol_xref_id);
    my $is_lrg = 0;

    my $find_sth   = $self->get_lrg_find_sth($dbi);
    my $status_sth = $self->get_lrg_set_status_sth($dbi);
    my $hgnc_sth   = $self->get_lrg_to_hgnc_sth($dbi);

    # look for LRG_HGNC_notransfer, if found then find HGNC equiv and set to this
    $find_sth->execute("LRG_HGNC_notransfer", $gene_id, "Gene");
    my ($label, $lrg_xref_id, $lrg_object_xref_id, $priority);
    $find_sth->bind_columns(\$label, \$lrg_xref_id, \$lrg_object_xref_id, \$priority);

    while ($find_sth->fetch) {
        # set ox_status to NO_DISPLAY as we do not want this transferred,
        # just the equivalent hgnc
        $status_sth->execute($lrg_object_xref_id);

        # Only the first row is read; the query orders by source priority.
        my ($hgnc_xref_id, $hgnc_priority);
        $hgnc_sth->execute($label, "HGNC");
        $hgnc_sth->bind_columns(\$hgnc_xref_id, \$hgnc_priority);
        $hgnc_sth->fetch;

        next unless defined $hgnc_xref_id;

        $gene_symbol         = $label;
        $gene_symbol_xref_id = $hgnc_xref_id;
        $is_lrg              = 1;
    }

    return ($gene_symbol, $gene_symbol_xref_id, $is_lrg);
}
938 
939 #############################END LRG BIT ################################################
940 
941 #
942 # These are the ones added by official naming and hence
943 # Need to be removed incase they still exist from a previous run
944 #
sub get_new_dbname_sources {
    my ($self, $dbi) = @_;

    my $dbname = $self->get_official_name();

    # Fixed sources plus the two that depend on the official source name.
    my @source_names = (
        qw(
            Clone_based_ensembl_gene
            Clone_based_ensembl_transcript
            RFAM_trans_name
            miRBase_trans_name
            EntrezGene_trans_name
        ),
        $dbname."_trans_name",
        $dbname,
    );

    my $sth = $dbi->prepare("select source_id from source where name like ?");

    my %dbname_to_source_id;
    my $missing = 0;

    foreach my $source_name (@source_names) {
        my $source_id;
        $sth->execute($source_name);
        $sth->bind_columns(\$source_id);
        $sth->fetch();

        if (defined $source_id) {
            $dbname_to_source_id{$source_name} = $source_id;
            next;
        }

        warn "Could not find external database name $source_name\n";
        $missing++;
    }

    # Despite the wording of the message this only warns (carp); the
    # partially filled map is still returned.
    if ($missing) {
        carp "Could not find name for $missing database name.\nTherefore Exiting.\nPlease add these sources";
    }

    return \%dbname_to_source_id;
}
984 
sub delete_old_data {
    # Remove everything a previous official-naming run created: synonyms,
    # identity_xrefs, object_xrefs and xrefs belonging to the generated
    # sources.
    #
    # Arguments:
    #   $dbname_to_source_id - hash ref source name => source_id
    #   $dbi                 - database handle for the xref database
    my ($self, $dbname_to_source_id, $dbi) = @_;

    my $dbname = $self->get_official_name();

    my @sources = qw(
        Clone_based_ensembl_gene
        Clone_based_ensembl_transcript
        EntrezGene_trans_name
        RFAM_trans_name
        miRBase_trans_name
    );
    push @sources, $dbname."_trans_name";

    # A source missing from the map would put an empty slot into the IN
    # list and make every statement below invalid SQL, so such entries are
    # left out.
    my @source_ids = grep { defined } map { $dbname_to_source_id->{$_} } @sources;
    if (!@source_ids) {
        warn "No source ids found for old official naming data; nothing to delete\n";
        return;
    }
    my $list = join(", ", @source_ids);

    print "LIST to delete $list\n";

    my $sql =(<<"DE1");
DELETE s
 FROM synonym s, xref x
 WHERE s.xref_id = x.xref_id AND
 x.source_id in ( $list );
DE1

    my $del_identity_sql =(<<"DE2");
DELETE i
 FROM object_xref o, xref x, identity_xref i
 WHERE i.object_xref_id = o.object_xref_id AND
 x.xref_id = o.xref_id AND
 x.source_id in ( $list )
DE2

    my $del_ox_sql = (<<"DE3");
DELETE o
 FROM object_xref o, xref x
 WHERE x.xref_id = o.xref_id AND
 x.source_id in ( $list )
DE3

    my $del_x_sql = "delete x from xref x where x.source_id in ( $list )";

    # Order matters: rows that refer to xrefs go first, the xrefs last.
    foreach my $statement ($sql, $del_identity_sql, $del_ox_sql, $del_x_sql) {
        my $sth = $dbi->prepare($statement);
        $sth->execute();
    }
    return;
}
1042 
1043 
sub reset_display_xrefs {
    my ($self, $dbi) = @_;

    # Transcripts are reset through the handle passed in.
    my $transcript_sth = $dbi->prepare("update transcript_stable_id set display_xref_id = null");
    $transcript_sth->execute;

    # Genes are reset through the object's own xref connection, and desc_set
    # is cleared as well.
    # NOTE(review): presumably the same connection as $dbi - confirm.
    my $gene_sth = $self->xref->dbc->prepare("UPDATE gene_stable_id SET display_xref_id = null, desc_set =0");
    $gene_sth->execute;

    return;
}
1056 
1057 1;
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
map
public map()
XrefMapper::BasicMapper
Definition: BasicMapper.pm:8
run
public run()