ensembl-hive  2.7.0
BasicMapper.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefMapper::BasicMapper;
21 
22 use strict;
23 use warnings;
24 use Carp;
25 use Cwd;
26 use File::Basename;
27 use IPC::Open3;
28 
29 use XrefMapper::db;
31 
32 =head2 new
33 
34  Description: Constructor for BasicMapper.
35  Returntype : BasicMapper
36  Exceptions : none
37  Caller : general
38 
39 =cut
40 
sub new {
    my ($class) = @_;

    # A mapper starts life as an empty hash; the accessors fill it in later.
    # Any further constructor arguments are accepted but ignored.
    return bless {}, $class;
}
48 
49 
50 
51 =head2 xref
52 
53  Arg [1] : (optional)
54  Example : $mapper->xref($new_xref);
55  Description: Getter / Setter for the connection
56  info for the xref database.
57  Returntype : XrefMapper::db
58  Exceptions : none
59 
60 =cut
61 
sub xref {
    my ($self, $arg) = @_;

    # Only a defined argument overwrites the stored value, so calling with
    # no argument (or with undef) behaves as a plain getter.
    $self->{_xref} = $arg if defined $arg;

    return $self->{_xref};
}
69 
70 =head2 farm_queue
71 
72  Arg [1] : (optional)
73  Example : $mapper->farm_queue("long");
74  Description: Getter / Setter for the farm queue.
75  Returntype : string
76  Exceptions : none
77 
78 =cut
79 
sub farm_queue {
    my ($self, $arg) = @_;

    # Acts as a setter only when a defined value is supplied.
    if (defined $arg) {
        $self->{_queue} = $arg;
    }

    return $self->{_queue};
}
87 
88 =head2 exonerate
89 
90  Arg [1] : (optional)
91  Example : $mapper->exonerate("/usr/local/exonerate1.1.1");
92  Description: Getter / Setter for the exonerate executable with full path.
93  Returntype : string
94  Exceptions : none
95 
96 =cut
97 
sub exonerate {
    my ($self, $arg) = @_;

    # Store the executable path only when one is actually given.
    $self->{_exonerate} = $arg if defined $arg;

    return $self->{_exonerate};
}
105 
106 =head2 core
107 
108  Arg [1] : (optional)
109  Example : $mapper->core($new_core);
110  Description: Getter / Setter for the connection
111  info for the ensembl core database.
112  Returntype : XrefMapper::db
113  Exceptions : none
114 
115 =cut
116 
sub core {
    my ($self, $arg) = @_;

    # A defined argument replaces the stored core database object.
    if (defined $arg) {
        $self->{_core} = $arg;
    }

    return $self->{_core};
}
124 
125 =head2 previous_core
126 
127  Arg [1] : (optional)
128  Example : $mapper->previous_core($old_core);
129  Description: Getter / Setter for the previous release of the core db.
130  Returntype : XrefMapper::db
131  Exceptions : none
132 
133 =cut
134 
sub previous_core {
    my ($self, $arg) = @_;

    # A defined argument replaces the stored previous-release core object.
    $self->{_previous_core} = $arg if defined $arg;

    return $self->{_previous_core};
}
142 
143 
144 =head2 add_meta_pair
145 
146  Arg [1] : key
147  Arg [2] : value
148  Example : $mapper->add_meta_pair("head_directory","/lustre/src/");
149  Description: Adds key value pairs to the database
150  Returntype : none
151  Exceptions : none
152 
153 =cut
154 
sub add_meta_pair {
    my ($self, $key, $value) = @_;

    # Bind key and value as parameters instead of splicing them into the
    # statement text: a quote character in either one used to break (or
    # alter) the SQL.
    my $sth = $self->xref->dbc->prepare(
        'insert into meta (meta_key, meta_value, date) values (?, ?, now())'
    );

    # The concatenating version turned undef into an empty string; keep that
    # rather than inserting NULL.
    $sth->execute(
        defined $key   ? $key   : '',
        defined $value ? $value : '',
    );
    $sth->finish;

    return;
}
164 
# Record a new pipeline status in the process_status table of the xref
# database, stamped with the current time.
#
# Arg [1]    : status string
# Returntype : none
sub update_process_status {
    my ($self, $value) = @_;

    # The status is bound as a parameter so that quote characters in it
    # cannot break the statement.
    my $sth_stat = $self->xref->dbc->prepare(
        'insert into process_status (status, date) values (?, now())'
    );

    # The concatenating version turned undef into an empty string; keep that.
    $sth_stat->execute(defined $value ? $value : '');
    $sth_stat->finish;

    return;
}
173 
# Return the most recent status recorded in process_status (the row with the
# highest id), or undef when the table is empty.
#
# Arg [1]    : (optional) verbose flag; every status row is printed when both
#              this flag and the mapper's own verbose setting are true
# Returntype : string or undef
sub xref_latest_status {
    my ($self, $verbose) = @_;
    $verbose ||= 0;

    my $sth = $self->xref->dbc->prepare("select id, status, date from process_status order by id");
    $sth->execute();

    my ($id, $status, $date);
    $sth->bind_columns(\$id, \$status, \$date);

    # Walk every row in id order; the bound variables keep the values of the
    # last row once the loop ends.
    while ($sth->fetch) {
        next unless $verbose and $self->verbose;
        print "$status\t$date\n";
    }

    return $status;
}
189 
# Look up a value in the meta table of the xref database.
#
# Arg [1]    : meta key; it is matched with LIKE, so SQL wildcards in the key
#              are honoured
# Returntype : the meta_value of the matching row with the highest meta_id,
#              or undef when nothing matches
sub get_meta_value {
    my ($self, $key) = @_;

    # The key is bound as a parameter rather than interpolated, so a quote
    # character in it cannot break or alter the statement.
    my $sth = $self->xref->dbc->prepare(
        'select meta_value from meta where meta_key like ? order by meta_id'
    );
    $sth->execute($key);

    my $value;
    $sth->bind_columns(\$value);

    # Rows arrive in meta_id order; draining the handle leaves the last
    # (most recently inserted) value in $value.
    while ($sth->fetch) {
    }
    $sth->finish;

    return $value;
}
204 
# Build a fully configured mapper object from an xref configuration file.
#
# The file consists of "key = value" lines grouped into sections that start
# with a "species=", "xref=" or "farm=" line. The species section selects
# the mapper class: XrefMapper::<species> is tried first, then
# XrefMapper::<taxon> (if a taxon is given), then XrefMapper::BasicMapper.
#
# Arg [1]    : path of the configuration file
# Arg [2]    : (optional) verbose flag; when true a warning is issued if no
#              species- or taxon-specific mapper module could be found
# Returntype : the configured mapper object
# Exceptions : croaks if the file cannot be read, if no xref host is given,
#              or if a fasta directory is missing or not configured;
#              dies if a mapper module exists but fails to load;
#              exits with status 1 if the species name has no underscore
sub process_file {
    my $self    = shift;
    my $file    = shift;
    my $verbose = shift;

    my ($species_hash, $xref_hash, $farm_hash) = _read_config_sections($file);

    my $value = $species_hash->{'species'};
    my $taxon = $species_hash->{'taxon'};

    if (!defined $value or $value !~ /_/) {
        my $shown = defined $value ? $value : '';
        print STDERR "\'$shown\' is not a recognised species - please use full species name (e.g. homo_sapiens) in $file\n";
        exit(1);
    }

    my $module = _load_specific_mapper($value, $taxon);
    if (!defined $module) {
        if ($verbose) {
            my $warning_msg = "Did not find a specific mapping module XrefMapper::$value ";
            if (defined $taxon) {
                $warning_msg .= "or XrefMapper::$taxon ";
            }
            $warning_msg .= "- using XrefMapper::BasicMapper instead\n";
            carp($warning_msg);
        }
        require XrefMapper::BasicMapper;
        $module = "BasicMapper";
    }

    my $mapper = "XrefMapper::$module"->new();

    if (defined $farm_hash->{'queue'}) {
        $mapper->farm_queue($farm_hash->{'queue'});
    }
    if (defined $farm_hash->{'exonerate'}) {
        $mapper->exonerate($farm_hash->{'exonerate'});
    }

    #
    # xref database
    #
    if (!defined $xref_hash->{'host'}) {
        croak "No host name given for xref database\n";
    }

    my $xref_host   = $xref_hash->{'host'};
    my $xref_dbname = $xref_hash->{'dbname'};
    my $xref = XrefMapper::db->new(
        -host   => $xref_host,
        -port   => (defined $xref_hash->{'port'}     ? $xref_hash->{'port'}     : 3306),
        -user   => $xref_hash->{'user'},
        -pass   => (defined $xref_hash->{'password'} ? $xref_hash->{'password'} : ''),
        -group  => 'core',
        -dbname => $xref_dbname,
    );

    $mapper->xref($xref);
    $mapper->add_meta_pair("xref", $xref_host.":".$xref_dbname);
    _attach_fasta_dir($xref, $xref_hash->{'dir'}, 'xref');

    #
    # core (species) database
    #
    my $core_host   = $species_hash->{'host'};
    my $core_dbname = $species_hash->{'dbname'};
    my $core = XrefMapper::db->new(
        -host   => $core_host,
        -port   => (defined $species_hash->{'port'}     ? $species_hash->{'port'}     : ''),
        -user   => $species_hash->{'user'},
        -pass   => (defined $species_hash->{'password'} ? $species_hash->{'password'} : ''),
        -group  => 'core',
        -dbname => $core_dbname,
    );

    $mapper->core($core);
    $mapper->add_meta_pair("species", $core_host.":".$core_dbname);
    _attach_fasta_dir($core, $species_hash->{'dir'}, 'ensembl');
    $core->species($value);

    # Connect to the previous release of the core db if connection details
    # are specified (pr_host, pr_port, pr_dbname, pr_user).
    if (    defined($species_hash->{'pr_host'})
        and defined($species_hash->{'pr_user'})
        and defined($species_hash->{'pr_dbname'}) )
    {
        my $pr_host   = $species_hash->{'pr_host'};
        my $pr_dbname = $species_hash->{'pr_dbname'};

        my $previous_core = XrefMapper::db->new(
            -host   => $pr_host,
            -port   => $species_hash->{'pr_port'},
            -user   => $species_hash->{'pr_user'},
            -pass   => '',
            -group  => 'core',
            -dbname => $pr_dbname,
        );

        $mapper->previous_core($previous_core);

        # NOTE(review): this reuses the "species" meta key, so the newest
        # "species" meta row then names the previous core database - confirm
        # that this is intended.
        $mapper->add_meta_pair("species", $pr_host.":".$pr_dbname);
    }

    return $mapper;
}

# Parse the configuration file into its three sections and return
# references to the species, xref and farm hashes (in that order).
sub _read_config_sections {
    my ($file) = @_;

    my %section = (species => {}, xref => {}, farm => {});
    my $current;

    open my $fh, "<", $file or croak ("\nCannot open input file '$file':\n $!\n");
    while (my $line = <$fh>) {
        chomp($line);
        next if $line =~ /^#/;
        next if !$line;

        # Split on the first '=' only, so that values which themselves
        # contain '=' (passwords, for instance) are kept whole.
        my ($key, $value) = split(/=/, $line, 2);
        foreach my $field ($key, $value) {
            next if !defined $field;
            $field =~ s/^\s+//;
            $field =~ s/\s+$//;
        }
        $key = '' if !defined $key;

        if ($key eq 'species' or $key eq 'xref' or $key eq 'farm') {
            $current = $key;
            $section{species}{species} = $value if $key eq 'species';
            next;
        }

        # Settings that appear before any section header belong nowhere.
        next if !defined $current;

        $section{$current}{lc($key)} = $value;
    }
    close $fh or croak "Can't close file";

    return ($section{species}, $section{xref}, $section{farm});
}

# Try to load XrefMapper::<species> and then XrefMapper::<taxon>. Returns the
# name that loaded, or nothing if neither module file exists. Dies with the
# original error if a module exists but cannot be loaded.
sub _load_specific_mapper {
    my ($species, $taxon) = @_;

    foreach my $candidate ($species, $taxon) {
        next if !defined $candidate;

        my $class = "XrefMapper/$candidate.pm";

        # Append an explicit 1 so success does not depend on the value the
        # module file happens to return.
        my $loaded = eval { require $class; 1 };
        return $candidate if $loaded;

        # Only "module file not found" lets us fall through to the next
        # candidate; anything else is a real failure.
        die "$@" unless $@ =~ /Can\'t locate \Q$class\E/;
    }

    return;
}

# Register the fasta dump directory on a database object and check that it
# exists on disk.
sub _attach_fasta_dir {
    my ($db, $dir, $what) = @_;

    if (!defined $dir) {
        croak "No directory specified for the $what fasta files\n";
    }

    $db->dir($dir);
    if (!-d $dir) {
        croak "directory ".$dir." does not exist please create this\n";
    }

    return;
}
428 
429 
430 =head2 dumpcheck
431 
432  Arg [1] : (optional)
433  Example : $mapper->dumpcheck("yes");
434  Description: Getter / Setter for dumpcheck.
435  If set the mapper will not dump fasta files
436  if they exist already.
437  Returntype : scalar
438  Exceptions : none
439 
440 =cut
441 
sub dumpcheck {
    my ($self, $arg) = @_;

    # A defined argument switches the flag; otherwise this is a pure getter.
    if (defined $arg) {
        $self->{_dumpcheck} = $arg;
    }

    return $self->{_dumpcheck};
}
449 
# Getter / setter for the "nofarm" setting. The value is only stored and
# returned here; a defined argument replaces it.
sub nofarm {
    my ($self, $arg) = @_;

    $self->{_nofarm} = $arg if defined $arg;

    return $self->{_nofarm};
}
457 
# Getter / setter for the verbosity flag. A defined argument replaces the
# stored value.
sub verbose {
    my ($self, $arg) = @_;

    if (defined $arg) {
        $self->{_verbose} = $arg;
    }

    return $self->{_verbose};
}
465 
# Getter / setter for the species id. A defined argument replaces the
# stored value.
sub species_id {
    my ($self, $arg) = @_;

    $self->{_species_id} = $arg if defined $arg;

    return $self->{_species_id};
}
473 
# Look up the species_id for a species name in the xref database.
#
# Arg [1]    : species name
# Returntype : species_id, or undef if the name is unknown (in which case
#              the list of known species names is printed to STDERR)
sub get_id_from_species_name {
    my ($self, $species_name) = @_;

    # The name is bound as a parameter so that quote characters in it cannot
    # break or alter the statement.
    my $sth = $self->xref->dbc->prepare("select species_id from species where name = ?");
    $sth->execute($species_name);
    my @row = $sth->fetchrow_array();
    $sth->finish();

    my $species_id;
    if (@row) {
        $species_id = $row[0];
    }
    else {
        print STDERR "Couldn't get ID for species ".$species_name."\n";
        print STDERR "It must be one of :-\n";

        my $names_sth = $self->xref->dbc->prepare("select name from species");
        $names_sth->execute();
        while (my @name_row = $names_sth->fetchrow_array()) {
            print STDERR $name_row[0]."\n";
        }
        $names_sth->finish();
    }

    return $species_id;
}
500 
501 #
502 # Alt alleles
503 #
504 
# Copy the alt-allele groups of the core database into the alt_allele table
# of the xref database, then add the LRG genes as extra alt alleles.
#
# For every group one reference gene is stored with is_reference = 1 and
# the remaining genes with is_reference = 0. Groups for which no reference
# gene can be determined are skipped with a warning. Finally the process
# status is set to "alt_alleles_added".
#
# Returntype : none
sub get_alt_alleles {
    my $self = shift;

    my $dba  = $self->core->dba;
    my $aaga = Bio::EnsEMBL::DBSQL::AltAlleleGroupAdaptor->new($dba);

    my $aa_list = $aaga->fetch_all();

    my $count      = scalar(@$aa_list);
    my $max_alt_id = 0;
    my $sth;
    my $insert_sth = $self->xref->dbc->prepare("insert into alt_allele (alt_allele_id, gene_id, is_reference) values (?, ?,?)");

    if ($count) {
        $sth = $self->xref->dbc->prepare("delete from alt_allele");
        $sth->execute;
        my $alt_added    = 0;
        my $num_of_genes = 0;

        # Iterate through all alt-allele groups, pushing unique alleles into
        # the xref alt allele table.
        foreach my $aag (@$aa_list) {
            my $ref_gene = $aag->rep_Gene_id();

            # Representative gene not guaranteed, try to find an alternative
            # best fit: a gene lying on a reference slice (the last such
            # gene wins).
            if (!$ref_gene) {
                my $genes = $aag->get_all_Genes;
                foreach my $gene (@$genes) {
                    if ($gene->slice->is_reference) {
                        $ref_gene = $gene->dbID;
                    }
                }
            }
            if (!$ref_gene) {
                warn('Tried very hard but failed to select a representative gene for alt-allele-group '.$aag->dbID);
                next;
            }

            my $others = $aag->get_all_Gene_ids('no rep');

            # Extra step in place to handle non-ref situations: drop the
            # chosen reference gene from the list of other genes. The ids
            # are compared as whole numbers; a regex match would also drop
            # every gene whose id merely contains the reference id (12
            # would remove 123 and 512).
            my @cleaned_others = grep { $_ != $ref_gene } @$others;

            $insert_sth->execute($aag->dbID, $ref_gene, 1);
            $num_of_genes++;
            $alt_added++;
            foreach my $aa (@cleaned_others) {
                $insert_sth->execute($aag->dbID, $aa, 0);
                $num_of_genes++;
            }

            if ($aag->dbID > $max_alt_id) { $max_alt_id = $aag->dbID }
        }

        print "$alt_added alleles found containing $num_of_genes genes\n";
    }
    else {
        print "No alt_alleles found for this species.\n" ;
    }

    ### LRGs added as alt_alleles in the XREF system but never added to core.

    #
    # Use $max_alt_id for new ones.
    #

    my $sql =(<<'LRG');
SELECT  ox.ensembl_id, g.gene_id
  FROM xref x, object_xref ox, external_db e, gene g
    WHERE x.xref_id = ox.xref_id AND
          e.external_db_id = x.external_db_id AND
          e.db_name like "Ens_Hs_gene" AND
          ox.ensembl_object_type = "Gene" AND
          x.display_label = g.stable_id
LRG

    $sth = $self->core->dbc->prepare($sql);
    my ($core_gene_id, $lrg_gene_id);
    $sth->execute();
    $sth->bind_columns(\$lrg_gene_id, \$core_gene_id);

    $count = 0;

    my $old_count = 0;
    my $new_count = 0;
    my $lrg_count = 0;

    #
    # If the core gene is already in an alt_allele set then use that alt_id
    # for the LRG gene only. Else use a new one and add both core and LRG.
    #
    while ($sth->fetch()) {
        my $aag = $aaga->fetch_by_gene_id($core_gene_id);
        if ($aag) {
            $insert_sth->execute($aag->dbID, $lrg_gene_id, 0);
            $old_count++;
        }
        else {
            $aag = $aaga->fetch_by_gene_id($lrg_gene_id);
            if ($aag) {
                $insert_sth->execute($aag->dbID, $lrg_gene_id, 1);
                print "LRG perculiarity\t$core_gene_id\t$lrg_gene_id\n";
                $lrg_count++;
            }
            else {
                $max_alt_id++;
                $insert_sth->execute($max_alt_id, $lrg_gene_id, 0);
                $insert_sth->execute($max_alt_id, $core_gene_id, 1);
                $new_count++;
            }
        }
        $count++;
    }

    if ($count) {
        print "Added $count alt_allels for the lrgs. $old_count added to previous alt_alleles and $new_count new ones\n";
        print "LRG problem count = $lrg_count\n";
    }

    $self->update_process_status("alt_alleles_added");
    return;
}
632 
633 
634 #
635 # Default behaviour is not to do the official naming.
636 # Overload this method in the species file, returning the
637 # official database name, to do so.
638 # (i.e. human -> HGNC, mouse -> MGI, zebrafish -> ZFIN_ID)
639 #
sub get_official_name {
    # This base implementation names no official database: it returns
    # undef in scalar context and an empty list in list context.
    return;
}
643 
644 
645 
646 #
647 # Biomart insists that a source is linked to only one ensembl
648 # object type (Gene, Transcript, Translation). So biomart_fix
649 # will move the $db_name entries from one object type to the other,
650 # e.g. move all HGNC from Transcripts to Genes.
651 #
# Move all DUMP_OUT object_xrefs of one source from one ensembl object type
# to another, then delete what is left behind.
#
# Arg [1]    : source name
# Arg [2]    : first object type the source is attached to
# Arg [3]    : second object type the source is attached to
# Arg [4]    : (optional) verbose flag; only whether it is defined matters
# Arg [5]    : (optional) database connection, defaults to $self->xref->dbc
# Returntype : none
#
# The direction is derived from the two types: towards Gene if either type
# is Gene, otherwise from Translation to Transcript. GO and goslim_goa are
# the exception and always move from Transcript to Translation.
sub biomart_fix {
    my ($self, $db_name, $type1, $type2, $verbose, $xref_dbc) = @_;
    $xref_dbc = $self->xref->dbc unless defined $xref_dbc;

    print "$db_name is associated with both $type1 and $type2 object types\n" if(defined($verbose));
    print "$db_name moved to Gene level.\n" if(!defined($verbose));

    my ($to, $from, $to_id, $from_id);
    if ($type1 eq "Gene" or $type2 eq "Gene") {
        $to    = "Gene";
        $to_id = "gene_id";
        if ($type1 eq "Translation" or $type2 eq "Translation") {
            $from    = "Translation";
            $from_id = "translation_id";
        }
        else {
            $from    = "Transcript";
            $from_id = "transcript_id";
        }
    }
    else {
        $to      = "Transcript";
        $to_id   = "transcript_id";
        $from    = "Translation";
        $from_id = "translation_id";
    }

    my $is_go = ($db_name eq 'GO' || $db_name eq 'goslim_goa');
    if ($is_go) {
        $to      = 'Translation';
        $from    = 'Transcript';
        $to_id   = 'translation_id';
        $from_id = 'transcript_id';
    }

    print "Therefore moving all associations from $from to ".$to."\n" if(defined($verbose));

    # NOTE(review): $db_name is interpolated into the statements below; the
    # callers in this file pass source names read from the database or from
    # a fixed list. It is left as is because only ->do($sql) is known to be
    # available on the connection object.

    # Re-point the object_xrefs. IGNORE skips rows that would collide with
    # an entry already present on the target object.
    my $move_sql =(<<"MOVE");
  UPDATE IGNORE object_xref, gene_transcript_translation, xref, source
    SET object_xref.ensembl_object_type = "$to",
      object_xref.ensembl_id = gene_transcript_translation.$to_id
        WHERE object_xref.ensembl_object_type = "$from" AND
          object_xref.ensembl_id = gene_transcript_translation.$from_id AND
            xref.xref_id = object_xref.xref_id AND
              xref.source_id = source.source_id AND
                object_xref.ox_status = "DUMP_OUT"  AND
                  source.name = "$db_name";
MOVE
    $xref_dbc->do($move_sql);

    if ($is_go) {
        # Remove whatever could not be moved off the old object type.
        my $delete_sql =(<<"DELETE_OLD");
  DELETE object_xref, identity_xref
    FROM object_xref, xref, source, identity_xref
      WHERE object_xref.ensembl_object_type = "$from" AND
        identity_xref.object_xref_id = object_xref.object_xref_id AND
        xref.xref_id = object_xref.xref_id AND
          xref.source_id = source.source_id AND
            object_xref.ox_status = "DUMP_OUT"  AND
              source.name = "$db_name";
DELETE_OLD
        $xref_dbc->do($delete_sql);

        # Special tidying up for transcripts without translation: the
        # resulting object_xref does not have an ensembl_id to map to.
        # This statement used to be built and then discarded; it is now
        # actually executed.
        my $tidy_sql =(<<"TIDY");
  DELETE object_xref, identity_xref
    FROM object_xref, xref, source, identity_xref
      WHERE object_xref.ensembl_object_type = "$to" AND
        identity_xref.object_xref_id = object_xref.object_xref_id AND
        xref.xref_id = object_xref.xref_id AND
          xref.source_id = source.source_id AND
            object_xref.ensembl_id = 0 AND
              object_xref.ox_status = "DUMP_OUT"  AND
                source.name = "$db_name";
TIDY
        $xref_dbc->do($tidy_sql);
    }
    else {
        # Remove whatever could not be moved off the old object type,
        # together with any identity_xref rows attached to it.
        my $delete_sql =(<<"DELETE_OLD");
  DELETE object_xref, identity_xref
    FROM xref, source, object_xref
      LEFT JOIN identity_xref
        ON identity_xref.object_xref_id = object_xref.object_xref_id
      WHERE object_xref.ensembl_object_type = "$from" AND
        xref.xref_id = object_xref.xref_id AND
          xref.source_id = source.source_id AND
            object_xref.ox_status = "DUMP_OUT"  AND
              source.name = "$db_name";
DELETE_OLD
        $xref_dbc->do($delete_sql);
    }

    # Delete dependent_xref rows whose object_xref no longer exists. This
    # statement was also built but never executed before.
    my $dependent_sql =(<<'DEPENDENT');
  DELETE FROM dependent_xref WHERE object_xref_id NOT IN
   (SELECT object_xref_id FROM object_xref);
DEPENDENT
    $xref_dbc->do($dependent_sql);

    return;
}
758 
759 
760 #
761 # This sub finds which sources lie on multiple ensembl object types
762 # and calls biomart_fix to fix this.
763 #
sub biomart_testing {
    my ($self) = @_;

    my $sql = 'SELECT ox.ensembl_object_type, COUNT(*), s.name FROM xref x, object_xref ox, source s WHERE x.xref_id = ox.xref_id AND s.source_id = x.source_id and ox.ox_status = "DUMP_OUT" GROUP BY s.name, ox.ensembl_object_type';

    # Each pass scans the grouped counts until one source shows up on two
    # object types, fixes that source, and then starts a fresh scan because
    # the fix changes the data being read. The loop ends after the first
    # pass that finds nothing to fix.
    my $rescan_needed = 1;
    while ($rescan_needed) {
        $rescan_needed = 0;

        my $sth = $self->xref->dbc->prepare($sql);
        $sth->execute();

        my ($type, $count, $name);
        $sth->bind_columns(\$type, \$count, \$name);

        my $previous_name = "DEFAULT";
        my $previous_type;

        ROW:
        while ($sth->fetch) {
            if ($previous_name eq $name) {
                $rescan_needed = 1;
                $self->biomart_fix($name, $previous_type, $type, 1);
                last ROW;
            }
            $previous_name = $name;
            $previous_type = $type;
        }
        $sth->finish;
    }

    my $tester = XrefMapper::TestMappings->new($self);
    croak "Problems found before source_defined_move\n" if $tester->unlinked_entries;

    $self->update_process_status('biomart_test_finished');
    return;
}
800 
801 #
802 # Similar to above but just reports the problems.
803 # It does not fix them
804 #
805 
sub biomart_test {
    my ($self) = @_;

    my $sql = 'SELECT ox.ensembl_object_type, COUNT(*), s.name FROM xref x, object_xref ox, source s WHERE x.xref_id = ox.xref_id AND s.source_id = x.source_id and ox.ox_status = "DUMP_OUT" GROUP BY s.name, ox.ensembl_object_type';

    my $sth = $self->xref->dbc->prepare($sql);
    $sth->execute();

    my ($type, $count, $name);
    $sth->bind_columns(\$type, \$count, \$name);

    # The previous row, kept as (type, count, name). Two consecutive rows
    # with the same source name mean that the source is attached to more
    # than one object type.
    my @previous       = (undef, undef, "NOTKNOWN");
    my $header_printed = 0;

    while ($sth->fetch) {
        if ($previous[2] eq $name) {
            if (!$header_printed) {
                print STDERR "\nProblem Biomart test fails\n";
                $header_printed = 1;
            }
            print STDERR "$previous[2]\t$previous[1]\t$previous[0]\n";
            print STDERR "$name\t$count\t$type\n";
        }
        @previous = ($type, $count, $name);
    }
    $sth->finish;

    return;
}
836 
837 # remove a list of patterns from a string
sub filter_by_regexp {
    my ($self, $str, $regexps) = @_;

    # Strip every match of every pattern, case-insensitively. Patterns are
    # applied in the order given, each one to the result of the previous.
    for my $pattern (@{$regexps}) {
        $str =~ s/$pattern//gi;
    }

    return $str;
}
849 
850 
# Look up the species_id for a species name.
#
# Arg [1]    : species name
# Returntype : species_id, or undef if the name is unknown (in which case
#              the list of known species names is printed to STDERR)
sub get_species_id_from_species_name {
    my ($self, $species) = @_;

    # This method used to call $self->dbc unconditionally, but no dbc method
    # is defined in this class. Use it when the object provides one (for
    # instance in a subclass) and fall back to the xref database connection
    # otherwise.
    my $dbc = $self->can('dbc') ? $self->dbc : $self->xref->dbc;

    # The name is bound as a parameter so that quote characters in it cannot
    # break or alter the statement.
    my $sth = $dbc->prepare("select species_id from species where name = ?");
    $sth->execute($species);
    my @row = $sth->fetchrow_array();
    $sth->finish();

    my $species_id;
    if (@row) {
        $species_id = $row[0];
    }
    else {
        print STDERR "Couldn't get ID for species ".$species."\n";
        print STDERR "It must be one of :-\n";

        my $names_sth = $dbc->prepare("select name from species");
        $names_sth->execute();
        while (my @name_row = $names_sth->fetchrow_array()) {
            print STDERR $name_row[0]."\n";
        }
        $names_sth->finish();
    }

    return $species_id;
}
876 
877 
# Reset the xref database to its state right after parsing: mapping results
# are removed, and optionally the copied core data as well.
#
# Arg [1]    : unused
# Arg [2]    : (optional) when true, the core-info tables
#              (gene_transcript_translation and the *_stable_id tables)
#              are kept
# Returntype : none
sub clean_up {
    my ($self, $stats, $keep_core_data) = @_;

    # Prepare and execute a single statement on the xref database.
    my $run = sub {
        my ($statement) = @_;
        my $handle = $self->xref->dbc->prepare($statement);
        $handle->execute();
        return;
    };

    # Remove all object_xref and identity_xref entries.
    $run->("TRUNCATE table object_xref");
    $run->("TRUNCATE table identity_xref");

    # Remove all xrefs after PARSED_xref_id, when that marker is recorded.
    my $max_xref_id = $self->get_meta_value("PARSED_xref_id");
    if ($max_xref_id) {
        $run->("DELETE from xref where xref_id > $max_xref_id");
    }

    # Set dumped to NULL for all xrefs and clear the priority tables.
    foreach my $statement (
        "UPDATE xref set dumped = null",
        "DELETE from display_xref_priority",
        "DELETE from gene_desc_priority",
    ) {
        $run->($statement);
    }

    return if $keep_core_data;

    # Remove everything from the core-info tables.
    foreach my $statement (
        "DELETE from gene_transcript_translation",
        "DELETE from gene_stable_id",
        "DELETE from transcript_stable_id",
        "DELETE from translation_stable_id",
    ) {
        $run->($statement);
    }

    return;
}
941 
# Empty the tables that hold mapping job data and its derived information
# in the xref database.
#
# Returntype : none
sub remove_mapping_data {
    my ($self) = @_;

    # The tables are emptied one after another, in this order.
    foreach my $table (qw(mapping_jobs mapping alt_allele source_mapping_method)) {
        my $handle = $self->xref->dbc->prepare("DELETE from $table");
        $handle->execute();
    }

    return;
}
963 
964 
# Roll the xref database back to the state it had when parsing finished:
# all mapping results and mapping job data are removed and the process
# status is set to 'parsing_finished'.
#
# Returntype : none
sub revert_to_parsing_finished {
    my ($self) = @_;

    # Full clean-up: no arguments, so the copied core data is removed too.
    $self->clean_up();
    $self->remove_mapping_data();

    $self->update_process_status('parsing_finished');

    return;
}
976 
977 
# Roll the xref database back to the state it had when mapping finished:
# mapping results are removed but the copied core data is kept, all mapping
# jobs are marked SUBMITTED and the process status is set to
# 'mapping_finished'.
#
# Returntype : none
sub revert_to_mapping_finished {
    my ($self) = @_;

    # The second argument asks clean_up to keep the core data.
    $self->clean_up(undef, 1);

    # Set mapping jobs to SUBMITTED.
    my $reset_jobs_sql = 'UPDATE mapping_jobs set status = "SUBMITTED"';
    my $handle         = $self->xref->dbc->prepare($reset_jobs_sql);
    $handle->execute();

    $self->update_process_status('mapping_finished');

    return;
}
992 
993 #
994 # In case we have alt alleles with xrefs, these will be direct ones;
995 # we need to move all xrefs on to the reference.
996 #
997 
sub get_alt_allele_hashes {
    my ($self) = @_;

    my (%alt_to_ref, %ref_to_alts);

    my $sql = "select alt_allele_id, gene_id, is_reference from alt_allele order by alt_allele_id, is_reference DESC";

    my $sth = $self->xref->dbc->prepare($sql);
    $sth->execute();

    my ($alt_allele_id, $gene_id, $is_ref);
    $sth->bind_columns(\$alt_allele_id, \$gene_id, \$is_ref);

    # Rows arrive grouped by alt_allele_id with reference genes first, so
    # the first gene of each group stands in as that group's reference
    # (which is a non-reference gene if the group has no reference at all).
    # Every later gene of the group is recorded as an alternative of it.
    my $current_group = 0;
    my $group_ref_gene;
    while ($sth->fetch()) {
        if ($alt_allele_id == $current_group) {
            $alt_to_ref{$gene_id} = $group_ref_gene;
            push @{ $ref_to_alts{$group_ref_gene} }, $gene_id;
        }
        else {
            $group_ref_gene = $gene_id;
        }
        $current_group = $alt_allele_id;
    }
    $sth->finish;

    return (\%alt_to_ref, \%ref_to_alts);
}
1026 
1027 
# Consolidate the xrefs of gene-specific sources across alt alleles.
#
# First every such DUMP_OUT gene xref found on an alternative allele is
# moved on to the reference gene of its group; then the xrefs of each
# reference gene are copied back on to all of its alternative alleles.
# Finally the process status is set to 'alt_alleles_processed'.
#
# Arg [1]    : (optional) database connection used for the move and delete
#              statements, defaults to $self->xref->dbc
# Returntype : none
# Exceptions : croaks if a statement cannot be prepared, if object_xref is
#              empty, if an insert fails, or if unlinked entries are found
#              afterwards
sub process_alt_alleles {
    my $self = shift;
    my $dbc  = shift;
    $dbc = $self->xref->dbc unless defined $dbc;

    # ALL are on the Gene level now. This may change but for now it is okay.
    my ($alt_to_ref, $ref_to_alts) = $self->get_alt_allele_hashes();

    my $tester = XrefMapper::TestMappings->new($self);

    # The list of gene-specific sources costs one query per candidate source
    # and does not change while this method runs, so it is fetched once and
    # the quoted "IN (...)" tail is shared by all four statements below.
    my $source_list = "'" . join("', '", $self->get_gene_specific_list()) . "')";

    #
    # Move the xrefs on to the reference Gene.
    # NOTE: IGNORE is used as the xref might already be on this Gene and we
    # do not want it to crash.
    #
    my $move_sql =(<<'MOVE');
UPDATE IGNORE object_xref ox, xref x, source s
  SET ox.ensembl_id = ?
    WHERE x.source_id = s.source_id AND
          ox.xref_id = x.xref_id AND
          ox.ensembl_id = ? AND
          ox.ensembl_object_type = 'Gene' AND
          ox.ox_status = 'DUMP_OUT' AND
          s.name in (
MOVE
    $move_sql .= $source_list;

    print "MOVE SQL\n$move_sql\n";

    #
    # Now where it was already on the Gene the ignore will have stopped the
    # move, so we now want to just remove those ones as they already exist.
    #
    my $del_ix_sql =(<<'DIX');
DELETE ix
  FROM identity_xref ix, object_xref ox, xref x, source s
    WHERE x.source_id = s.source_id AND
          ox.object_xref_id = ix.object_xref_id AND
          ox.xref_id = x.xref_id AND
          ox.ensembl_id = ? AND
          ox.ensembl_object_type = 'Gene' AND
          ox.ox_status = 'DUMP_OUT' AND
          s.name in (
DIX
    $del_ix_sql .= $source_list;

    my $del_sql =(<<'DEL');
DELETE ox
  FROM object_xref ox, xref x, source s
    WHERE x.source_id = s.source_id AND
          ox.xref_id = x.xref_id AND
          ox.ensembl_id = ? AND
          ox.ensembl_object_type = 'Gene' AND
          ox.ox_status = 'DUMP_OUT' AND
          s.name in (
DEL
    $del_sql .= $source_list;

    my $move_sth   = $dbc->prepare($move_sql)   || croak "$move_sql cannot be prepared";
    my $del_ix_sth = $dbc->prepare($del_ix_sql) || croak "$del_ix_sql cannot be prepared";
    my $del_sth    = $dbc->prepare($del_sql)    || croak "$del_sql cannot be prepared";

    my $move_count   = 0;
    my $del_ix_count = 0;
    my $del_ox_count = 0;
    foreach my $key (keys %$alt_to_ref) {
        $move_sth->execute($alt_to_ref->{$key}, $key);
        $move_count += $move_sth->rows;

        $del_ix_sth->execute($key);
        $del_ix_count += $del_ix_sth->rows;

        $del_sth->execute($key);
        $del_ox_count += $del_sth->rows;
    }
    $move_sth->finish;
    $del_sth->finish;
    $del_ix_sth->finish;

    print "Number of rows:- moved = $move_count, identitys deleted = $del_ix_count, object_xrefs deleted = $del_ox_count\n";

    #
    # Now we have all the data on the reference Gene we want to copy all the
    # data onto the alt alleles.
    #
    my $get_data_sql=(<<'GET');
SELECT ox.object_xref_id, ox.ensembl_object_type, ox.xref_id, ox.linkage_annotation,
       ox.linkage_type, ox.ox_status, ox.unused_priority, ox.master_xref_id,
       ix.query_identity, ix.target_identity, ix.hit_start, ix.hit_end,
       ix.translation_start, ix.translation_end, ix.cigar_line, ix.score, ix.evalue
  FROM xref x, source s, object_xref ox
    LEFT JOIN identity_xref ix ON ox.object_xref_id =ix.object_xref_id
      WHERE  x.source_id = s.source_id AND
             ox.xref_id = x.xref_id AND
             ox.ensembl_id = ? AND
             ox.ox_status = 'DUMP_OUT' AND
             ox.ensembl_object_type = 'Gene' AND
             s.name in (
GET
    $get_data_sql .= $source_list;

    my $get_data_sth = $self->xref->dbc->prepare($get_data_sql) || croak "Could not prepare $get_data_sql";

    my $insert_object_xref_sql =(<<'INO');
INSERT INTO object_xref (object_xref_id, ensembl_id, ensembl_object_type, xref_id, linkage_annotation,
            linkage_type, ox_status, unused_priority, master_xref_id)
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
INO

    my $insert_ox_sth = $self->xref->dbc->prepare($insert_object_xref_sql) || croak "Could not prepare $insert_object_xref_sql";

    my $insert_identity_xref_sql = (<<'INI');
INSERT INTO identity_xref (object_xref_id, query_identity, target_identity, hit_start, hit_end,
            translation_start, translation_end, cigar_line, score, evalue )
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
INI

    my $insert_ix_sth = $self->xref->dbc->prepare($insert_identity_xref_sql) || croak "Could not prepare $insert_identity_xref_sql";

    # New object_xref rows are numbered upwards from the current maximum.
    my $max_object_xref_id;

    my $sth = $self->xref->dbc->prepare("SELECT MAX(object_xref_id) FROM object_xref");
    $sth->execute();
    $sth->bind_columns(\$max_object_xref_id);
    $sth->fetch;
    $sth->finish;
    if ((!defined($max_object_xref_id)) or (!$max_object_xref_id)) {
        croak "Problem getting max object_xref_id";
    }
    $max_object_xref_id++;

    my $added_count = 0;
    my $ignored     = 0;
    foreach my $key (keys %$ref_to_alts) {
        $get_data_sth->execute($key);
        my ($object_xref_id, $ensembl_object_type, $xref_id, $linkage_annotation,
            $linkage_type, $ox_status, $unused_priority, $master_xref_id,
            $query_identity, $target_identity, $hit_start, $hit_end,
            $translation_start, $translation_end, $cigar_line, $score, $evalue);

        $get_data_sth->bind_columns(\$object_xref_id, \$ensembl_object_type, \$xref_id, \$linkage_annotation,
                                    \$linkage_type, \$ox_status, \$unused_priority, \$master_xref_id,
                                    \$query_identity, \$target_identity, \$hit_start, \$hit_end,
                                    \$translation_start, \$translation_end, \$cigar_line, \$score, \$evalue);

        while ($get_data_sth->fetch()) {
            foreach my $alt (@{$ref_to_alts->{$key}}) {
                $max_object_xref_id++;
                $insert_ox_sth->execute($max_object_xref_id, $alt, $ensembl_object_type, $xref_id, $linkage_annotation,
                                        $linkage_type, $ox_status, $unused_priority, $master_xref_id) || croak "Could not insert object_xref data";

                # ONLY add identity xref if object_xref was added successfully.
                if ($insert_ox_sth->rows) {
                    $added_count++;
                    $insert_ix_sth->execute($max_object_xref_id, $query_identity, $target_identity, $hit_start, $hit_end,
                                            $translation_start, $translation_end, $cigar_line, $score, $evalue) || croak "Could not insert identity_xref data";
                }
                else {
                    $ignored++;
                }
            }
        }
    }
    print "Added $added_count new mapping but ignored $ignored\n";

    if ($tester->unlinked_entries) {
        croak "Problems found after process_alt_alleles\n";
    }

    $self->update_process_status('alt_alleles_processed');
    return;
}
1204 
1205 
1206 #
1207 # These sources should be on the gene, even if they are mapped transcript or translation.
1208 # We define which ones are to be moved here
1209 #
sub get_gene_specific_list {
    my ($self, $dbi) = @_;

    $dbi = $self->xref->dbc unless defined $dbi;

    my @list = qw(DBASS3 DBASS5 EntrezGene miRBase RFAM TRNASCAN_SE RNAMMER UniGene Uniprot_gn WikiGene MIM_GENE MIM_MORBID HGNC MGI ZFIN_ID FlyBaseName_gene RGD SGD_GENE VGNC wormbase_gseqname wormbase_locus Xenbase GeneCards);

    # Check the sources are used in the database considered. The statement
    # is prepared once and re-executed with each source name bound as a
    # parameter, instead of being rebuilt and re-prepared for every source.
    my $sth = $dbi->prepare("SELECT COUNT(*) FROM xref x, source s WHERE s.source_id = x.source_id AND s.name = ?");

    my @used_list;
    foreach my $source (@list) {
        $sth->execute($source);
        my ($count) = $sth->fetchrow_array();
        $sth->finish();

        if (defined $count and $count > 0) {
            push @used_list, $source;
        }
    }

    return @used_list;
}
1234 
1235 
1236 
1237 #
1238 # Here we do the moving.
1239 #
# Move the xrefs of every gene-specific source that is in use from the
# Translation and Transcript level up to the Gene level, check that no
# unlinked entries result, and set the process status to
# 'source_level_move_finished'.
#
# Arg [1]    : (optional) database connection, handed on to
#              get_gene_specific_list and biomart_fix
# Returntype : none
# Exceptions : croaks if unlinked entries are found after the move
sub source_defined_move {
    my ($self, $dbi) = @_;

    foreach my $source ($self->get_gene_specific_list($dbi)) {
        foreach my $from_type ("Translation", "Transcript") {
            # biomart_fix takes (db_name, type1, type2, verbose, dbc). The
            # connection used to be passed one position too late, so it was
            # silently dropped and the default connection was always used.
            # verbose stays undefined on purpose.
            $self->biomart_fix($source, $from_type, "Gene", undef, $dbi);
        }
    }

    my $tester = XrefMapper::TestMappings->new($self);
    if ($tester->unlinked_entries) {
        croak "Problems found after source_defined_move\n";
    }

    $self->update_process_status('source_level_move_finished');
    return;
}
1255 
1256 1;
XrefMapper::db
Definition: db.pm:5
XrefMapper::db::dbc
public dbc()
XrefMapper::BasicMapper
Definition: BasicMapper.pm:8
XrefMapper::BasicMapper::core
public XrefMapper::db core()
info
public info()
process_file
public process_file()
Bio::EnsEMBL::DBSQL::AltAlleleGroupAdaptor
Definition: AltAlleleGroupAdaptor.pm:36