ensembl-hive  2.8.1
CoordinateMapper.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefMapper::CoordinateMapper;
21 
22 # $Id$
23 
24 # This is a set of subroutines used for creating Xrefs based on
25 # coordinate overlaps.
26 
27 
28 use strict;
29 use warnings;
30 
33 
34 use DBI qw( :sql_types );
35 
36 use Carp;
37 use IO::File;
38 use File::Spec::Functions;
39 
40 
41 use base qw( Exporter XrefMapper::BasicMapper);
42 
43 our @EXPORT = qw( run_coordinatemapping );
44 
45 our $coding_weight = 2;
46 our $ens_weight = 3;
47 
48 our $transcript_score_threshold = 0.75;
49 
50 my $verbose = 0;
51 
52 sub new {
53  my($class, $mapper) = @_;
54 
55  my $self ={};
56  bless $self,$class;
57  $self->core($mapper->core);
58  $self->xref($mapper->xref);
59  $self->mapper($mapper);
60  $verbose = $mapper->verbose();
61  return $self;
62 }
63 
64 sub mapper{
65  my ($self, $arg) = @_;
66 
67  (defined $arg) &&
68  ($self->{_mapper} = $arg );
69  return $self->{_mapper};
70 }
71 
72 
73 sub run_coordinatemapping {
74  my ( $self, $do_upload, $species_id) = @_;
75 
76 
77  my $sth_stat = $self->xref->dbc->prepare(
78  "INSERT INTO process_status (status, date) "
79  . "VALUES('coordinate_xrefs_started',now())" );
80  $sth_stat->execute();
81  $sth_stat->finish();
82 
83  my $xref_db = $self->xref();
84  my $core_db = $self->core();
85 
86  my $species = $core_db->species();
87 # my $species_id = $self->mapper->core->species;
88  $species_id = XrefMapper::BasicMapper::get_species_id_from_species_name( $xref_db, $species ) unless defined $species_id;
89  if (!defined $species_id) { return; }
90 
91 
92  # We only do coordinate mapping for mouse and human for now.
93  if ( !( $species eq 'mus_musculus' || $species eq 'homo_sapiens' ) ) {
94  # if ( !( $species eq 'mus_musculus') ) {
95  my $sth_stat = $self->xref->dbc->prepare(
96  "INSERT INTO process_status (status, date) "
97  . "VALUES ('coordinate_xref_finished',now())" );
98  $sth_stat->execute();
99  $sth_stat->finish();
100  return;
101  }
102  print "species = $species ($species_id))\n";
103 
104  my $output_dir = $core_db->dir();
105 
106  my $xref_filename = catfile( $output_dir, 'xref_coord.txt' );
107  my $object_xref_filename =
108  catfile( $output_dir, 'object_xref_coord.txt' );
109  my $unmapped_reason_filename =
110  catfile( $output_dir, 'unmapped_reason_coord.txt' );
111  my $unmapped_object_filename =
112  catfile( $output_dir, 'unmapped_object_coord.txt' );
113 
114  my $xref_dbh = $xref_db->dbc()->db_handle();
115  my $core_dbh = $core_db->dbc()->db_handle();
116 
117  ######################################################################
118  # Figure out the last used 'xref_id', 'object_xref_id', #
119  # 'unmapped_object_id', and 'unmapped_reason_id' from the Core #
120  # database. #
121  ######################################################################
122 
123  my $xref_id =
124  $core_dbh->selectall_arrayref('SELECT MAX(xref_id) FROM xref')
125  ->[0][0];
126  my $object_xref_id = $core_dbh->selectall_arrayref(
127  'SELECT MAX(object_xref_id) FROM object_xref')->[0][0];
128  my $unmapped_object_id = $core_dbh->selectall_arrayref(
129  'SELECT MAX(unmapped_object_id) FROM unmapped_object')->[0][0];
130  my $unmapped_reason_id = $core_dbh->selectall_arrayref(
131  'SELECT MAX(unmapped_reason_id) FROM unmapped_reason')->[0][0];
132 
133  log_progress( "Last used xref_id is %d\n", $xref_id );
134  log_progress( "Last used object_xref_id is %d\n",
135  $object_xref_id );
136  log_progress( "Last used unmapped_object_id is %d\n",
137  $unmapped_object_id );
138  log_progress( "Last used unmapped_reason_id is %d\n",
139  $unmapped_reason_id );
140 
141  ######################################################################
142  # Get an 'analysis_id', or discover that we need to add our analysis #
143  # to the 'analyis' table later. #
144  ######################################################################
145 
146  my $analysis_params =
147  sprintf( "weights(coding,ensembl)="
148  . "%.2f,%.2f;"
149  . "transcript_score_threshold=" . "%.2f",
150  $coding_weight, $ens_weight, $transcript_score_threshold );
151 
152  my $analysis_sql = qq(
153  SELECT analysis_id
154  FROM analysis
155  WHERE logic_name = 'xrefcoordinatemapping'
156  AND parameters = ?
157  );
158 
159  my $analysis_sth = $core_dbh->prepare($analysis_sql);
160  $analysis_sth->bind_param( 1, $analysis_params, SQL_VARCHAR );
161 
162  $analysis_sth->execute();
163 
164  my $analysis_id = $analysis_sth->fetchall_arrayref()->[0][0];
165  if ( !defined($analysis_id) ) {
166  $analysis_id =
167  $core_dbh->selectall_arrayref( "SELECT analysis_id FROM analysis "
168  . "WHERE logic_name = 'xrefcoordinatemapping'" )->[0][0];
169 
170  if ( defined($analysis_id) && $do_upload ) {
171  log_progress( "Will update 'analysis' table "
172  . "with new parameter settings\n" );
173 
174  #-----------------------------------------------------------------
175  # Update an existing analysis.
176  #-----------------------------------------------------------------
177 
178  my $sql = qq(
179  UPDATE analysis
180  SET created = now(), parameters = ?
181  WHERE analysis_id = ?
182  );
183 
184  $core_dbh->do( $sql, undef, $analysis_params, $analysis_id );
185 
186  } else {
187  log_progress("Can not find analysis ID for this analysis:\n");
188  log_progress(" logic_name = 'xrefcoordinatemapping'\n");
189  log_progress( " parameters = '%s'\n", $analysis_params );
190 
191  if ($do_upload) {
192 
193  #---------------------------------------------------------------
194  # Store a new analysis.
195  #---------------------------------------------------------------
196 
197  log_progress("A new analysis will be added\n");
198 
199  $analysis_id = $core_dbh->selectall_arrayref(
200  'SELECT MAX(analysis_id) FROM analysis')->[0][0];
201  log_progress( "Last used analysis_id is %d\n", $analysis_id );
202 
203  my $sql =
204  'INSERT INTO analysis '
205  . 'VALUES(?, now(), ?, \N, \N, \N, ?, '
206  . '\N, \N, ?, ?, \N, \N, \N)';
207  my $sth = $core_dbh->prepare($sql);
208 
209  $sth->bind_param( 1, ++$analysis_id, SQL_INTEGER );
210  $sth->bind_param( 2, 'xrefcoordinatemapping', SQL_VARCHAR );
211  $sth->bind_param( 3, 'CoordinateMapper.pm', SQL_VARCHAR );
212  $sth->bind_param( 4, $analysis_params, SQL_VARCHAR );
213  $sth->bind_param( 5, 'CoordinateMapper.pm', SQL_VARCHAR );
214 
215  $sth->execute();
216  }
217  } ## end else [ if ( defined($analysis_id...
218  } ## end if ( !defined($analysis_id...
219 
220  if ( defined($analysis_id) ) {
221  log_progress( "Analysis ID is %d\n",
222  $analysis_id );
223  }
224 
225  ######################################################################
226  # Read and store available Xrefs from the Xref database. #
227  ######################################################################
228 
229  my %unmapped;
230  my %mapped;
231 
232  my $xref_sql = qq(
233  SELECT coord_xref_id, source_id, accession
234  FROM coordinate_xref
235  WHERE species_id = ?
236  );
237 
238  my $xref_sth = $xref_dbh->prepare($xref_sql);
239  $xref_sth->bind_param( 1, $species_id, SQL_INTEGER );
240 
241  $xref_sth->execute();
242 
243  my $external_db_id;
244 
245  while ( my $xref = $xref_sth->fetchrow_hashref() ) {
246  $external_db_id ||=
247  $XrefMapper::BasicMapper::source_to_external_db{ $xref->{
248  'source_id'} };
249  $external_db_id ||= 11000; # FIXME (11000 is 'UCSC')
250 
251  $unmapped{ $xref->{'coord_xref_id'} } = {
252  'external_db_id' => $external_db_id,
253  'accession' => $xref->{'accession'},
254  'reason' => 'No overlap',
255  'reason_full' =>
256  'No coordinate overlap with any Ensembl transcript'
257  };
258  }
259  $xref_sth->finish();
260 
261  if(!defined($external_db_id)){
262  die "External_db_id is undefined species_id = $species_id\n";
263  }
264 
265  ######################################################################
266  # Do coordinate matching. #
267  ######################################################################
268 
269  my $core_db_adaptor =
271  -host => $core_db->dbc()->host(),
272  -port => $core_db->dbc()->port(),
273  -user => $core_db->dbc()->username(),
274  -pass => $core_db->dbc()->password(),
275  -dbname => $core_db->dbc()->dbname(),
276  );
277 
278  my $slice_adaptor = $core_db_adaptor->get_SliceAdaptor();
279  my @chromosomes = @{ $slice_adaptor->fetch_all('Chromosome') };
280 
281  my $sql = qq(
282  SELECT coord_xref_id, accession,
283  txStart, txEnd,
284  cdsStart, cdsEnd,
285  exonStarts, exonEnds
286  FROM coordinate_xref
287  WHERE species_id = ?
288  AND chromosome = ? AND strand = ?
289  AND ((txStart BETWEEN ? AND ?) -- txStart in region
290  OR (txEnd BETWEEN ? AND ?) -- txEnd in region
291  OR (txStart <= ? AND txEnd >= ?)) -- region is fully contained
292  ORDER BY accession
293  );
294 
295  foreach my $chromosome (@chromosomes) {
296  my $chr_name = $chromosome->seq_region_name();
297 
298  log_progress( "Processing chromsome '%s'\n", $chr_name );
299 
300  my @genes = @{ $chromosome->get_all_Genes( undef, undef, 1 ) };
301 
302  log_progress( "There are %4d genes on chromosome '%s'\n",
303  scalar(@genes), $chr_name );
304 
305  while ( my $gene = shift(@genes) ) {
306  my @transcripts = @{ $gene->get_all_Transcripts() };
307 
308  my %gene_result;
309 
310  foreach my $transcript ( sort { $a->start() <=> $b->start() }
311  @transcripts )
312  {
313 
314  ################################################################
315  # For each Ensembl transcript: #
316  # 1. Register all Ensembl exons in a RangeRegistry. #
317  # #
318  # 2. Find all transcripts in the external database that are #
319  # within the range of this Ensembl transcript. #
320  # #
321  # For each of those external transcripts: #
322  # 3. Calculate the overlap of the exons of the external #
323  # transcript with the Ensembl exons using the #
324  # overlap_size() method in the RangeRegistry. #
325  # #
326  # 4. Register the external exons in their own RangeRegistry. #
327  # #
328  # 5. Calculate the overlap of the Ensembl exons with the #
329  # external exons as in step 3. #
330  # #
331  # 6. Calculate the match score. #
332  # #
333  # 7. Decide whether or not to keep the match. #
334  ################################################################
335 
336  my @exons = @{ $transcript->get_all_Exons() };
337 
338  my %transcript_result;
339 
340  # '$rr1' is the RangeRegistry holding Ensembl exons for one
341  # transcript at a time.
343 
344  my $coding_transcript;
345  if ( defined( $transcript->translation() ) ) {
346  $coding_transcript = 1;
347  } else {
348  $coding_transcript = 0;
349  }
350 
351  foreach my $exon (@exons) {
352 
353  #-------------------------------------------------------------
354  # Register each exon in the RangeRegistry. Register both the
355  # total length of the exon and the coding range of the exon.
356  #-------------------------------------------------------------
357 
358  $rr1->check_and_register( 'exon', $exon->start(),
359  $exon->end() );
360 
361  if ( $coding_transcript
362  && defined( $exon->coding_region_start($transcript) )
363  && defined( $exon->coding_region_end($transcript) ) )
364  {
365  $rr1->check_and_register(
366  'coding',
367  $exon->coding_region_start($transcript),
368  $exon->coding_region_end($transcript) );
369  }
370  }
371 
372  #---------------------------------------------------------------
373  # Get hold of all transcripts from the external database that
374  # overlaps with this Ensembl transcript.
375  #---------------------------------------------------------------
376 
377  my $sth = $xref_dbh->prepare_cached($sql);
378  $sth->bind_param( 1, $species_id, SQL_INTEGER );
379  $sth->bind_param( 2, $chr_name, SQL_VARCHAR );
380  $sth->bind_param( 3, $gene->strand(), SQL_INTEGER );
381  $sth->bind_param( 4, $transcript->start(), SQL_INTEGER );
382  $sth->bind_param( 5, $transcript->end(), SQL_INTEGER );
383  $sth->bind_param( 6, $transcript->start(), SQL_INTEGER );
384  $sth->bind_param( 7, $transcript->end(), SQL_INTEGER );
385  $sth->bind_param( 8, $transcript->start(), SQL_INTEGER );
386  $sth->bind_param( 9, $transcript->end(), SQL_INTEGER );
387 
388  $sth->execute();
389 
390  my ( $coord_xref_id, $accession, $txStart, $txEnd, $cdsStart,
391  $cdsEnd, $exonStarts, $exonEnds );
392 
393  $sth->bind_columns(
394  \( $coord_xref_id, $accession, $txStart, $txEnd,
395  $cdsStart, $cdsEnd, $exonStarts, $exonEnds
396  ) );
397 
398  while ( $sth->fetch() ) {
399  my @exonStarts = split( /,\s*/, $exonStarts );
400  my @exonEnds = split( /,\s*/, $exonEnds );
401  my $exonCount = scalar(@exonStarts);
402 
403  # '$rr2' is the RangeRegistry holding exons from the external
404  # transcript, for one transcript at a time.
406 
407  my $exon_match = 0;
408  my $coding_match = 0;
409 
410  my $coding_count = 0;
411 
412  for ( my $i = 0 ; $i < $exonCount ; ++$i ) {
413 
414  #-----------------------------------------------------------
415  # Register the exons from the external database in the same
416  # was as with the Ensembl exons, and calculate the overlap
417  # of the external exons with the previously registered
418  # Ensembl exons.
419  #-----------------------------------------------------------
420 
421  my $overlap =
422  $rr1->overlap_size( 'exon', $exonStarts[$i],
423  $exonEnds[$i] );
424 
425  $exon_match +=
426  $overlap/( $exonEnds[$i] - $exonStarts[$i] + 1 );
427 
428  $rr2->check_and_register( 'exon', $exonStarts[$i],
429  $exonEnds[$i] );
430 
431  if ( !defined($cdsStart) || !defined($cdsEnd) ) {
432  # Non-coding transcript.
433  } else {
434  my $codingStart = ( $exonStarts[$i] > $cdsStart
435  ? $exonStarts[$i]
436  : $cdsStart );
437  my $codingEnd =
438  ( $exonEnds[$i] < $cdsEnd ? $exonEnds[$i] : $cdsEnd );
439 
440  if ( $codingStart < $codingEnd ) {
441  my $coding_overlap =
442  $rr1->overlap_size( 'coding', $codingStart,
443  $codingEnd );
444 
445  $coding_match +=
446  $coding_overlap/( $codingEnd - $codingStart + 1 );
447 
448  $rr2->check_and_register( 'coding', $codingStart,
449  $codingEnd );
450 
451  ++$coding_count;
452  }
453  }
454  } ## end for ( my $i = 0 ; $i < ...
455 
456  my $rexon_match = 0;
457  my $rcoding_match = 0;
458 
459  my $rcoding_count = 0;
460 
461  foreach my $exon (@exons) {
462 
463  #-----------------------------------------------------------
464  # Calculate the overlap of the Ensembl exons with the
465  # external exons.
466  #-----------------------------------------------------------
467 
468  my $overlap =
469  $rr2->overlap_size( 'exon', $exon->start(),
470  $exon->end() );
471 
472  $rexon_match +=
473  $overlap/( $exon->end() - $exon->start() + 1 );
474 
475  if ( $coding_transcript
476  && defined( $exon->coding_region_start($transcript) )
477  && defined( $exon->coding_region_end($transcript) ) )
478  {
479  my $coding_overlap =
480  $rr2->overlap_size( 'coding',
481  $exon->coding_region_start(
482  $transcript),
483  $exon->coding_region_end(
484  $transcript)
485  );
486 
487  $rcoding_match +=
488  $coding_overlap/
489  ( $exon->coding_region_end($transcript) -
490  $exon->coding_region_start($transcript) +
491  1 );
492 
493  ++$rcoding_count;
494  }
495  } ## end foreach my $exon (@exons)
496 
497  #-------------------------------------------------------------
498  # Calculate the match score.
499  #-------------------------------------------------------------
500 
501  my $score = ( ( $exon_match + $ens_weight*$rexon_match ) +
502  $coding_weight*(
503  $coding_match + $ens_weight*$rcoding_match
504  )
505  )/( ( $exonCount + $ens_weight*scalar(@exons) ) +
506  $coding_weight*(
507  $coding_count + $ens_weight*$rcoding_count
508  ) );
509 
510  if ( !defined( $transcript_result{$coord_xref_id} )
511  || $transcript_result{$coord_xref_id} < $score )
512  {
513  $transcript_result{$coord_xref_id} = $score;
514  }
515 
516  } ## end while ( $sth->fetch() )
517  $sth->finish();
518 
519  #---------------------------------------------------------------
520  # Apply transcript threshold and pick the best match(es) for
521  # this transcript.
522  #---------------------------------------------------------------
523 
524  my $best_score;
525  foreach my $coord_xref_id (
526  sort( { $transcript_result{$b} <=> $transcript_result{$a} }
527  keys(%transcript_result) ) )
528  {
529  my $score = $transcript_result{$coord_xref_id};
530 
531  if ( $score > $transcript_score_threshold ) {
532  $best_score ||= $score;
533 
534  if ( sprintf( "%.3f", $score ) eq
535  sprintf( "%.3f", $best_score ) )
536  {
537  if ( exists( $unmapped{$coord_xref_id} ) ) {
538  $mapped{$coord_xref_id} = $unmapped{$coord_xref_id};
539  delete( $unmapped{$coord_xref_id} );
540  $mapped{$coord_xref_id}{'reason'} = undef;
541  $mapped{$coord_xref_id}{'reason_full'} = undef;
542  $mapped{$coord_xref_id}{'chr_name'} = $chr_name;
543  }
544 
545  push( @{ $mapped{$coord_xref_id}{'mapped_to'} }, {
546  'ensembl_id' => $transcript->dbID(),
547  'ensembl_object_type' => 'Transcript'
548  } );
549 
550  # This is now a candidate Xref for the gene.
551  if ( !defined( $gene_result{$coord_xref_id} )
552  || $gene_result{$coord_xref_id} < $score )
553  {
554  $gene_result{$coord_xref_id} = $score;
555  }
556 
557  } elsif ( exists( $unmapped{$coord_xref_id} ) ) {
558  $unmapped{$coord_xref_id}{'reason'} =
559  'Was not best match';
560  $unmapped{$coord_xref_id}{'reason_full'} =
561  sprintf(
562  "Did not top best transcript match score (%.2f)",
563  $best_score );
564  if ( !defined( $unmapped{$coord_xref_id}{'score'} )
565  || $score > $unmapped{$coord_xref_id}{'score'} )
566  {
567  $unmapped{$coord_xref_id}{'score'} = $score;
568  $unmapped{$coord_xref_id}{'ensembl_id'} =
569  $transcript->dbID();
570  }
571  }
572 
573  } elsif ( exists( $unmapped{$coord_xref_id} )
574  && $unmapped{$coord_xref_id}{'reason'} ne
575  'Was not best match' )
576  {
577  $unmapped{$coord_xref_id}{'reason'} =
578  'Did not meet threshold';
579  $unmapped{$coord_xref_id}{'reason_full'} =
580  sprintf( "Match score for transcript "
581  . "lower than threshold (%.2f)",
582  $transcript_score_threshold );
583  if ( !defined( $unmapped{$coord_xref_id}{'score'} )
584  || $score > $unmapped{$coord_xref_id}{'score'} )
585  {
586  $unmapped{$coord_xref_id}{'score'} = $score;
587  $unmapped{$coord_xref_id}{'ensembl_id'} =
588  $transcript->dbID();
589  }
590  }
591  } ## end foreach my $coord_xref_id (...
592 
593  } ## end foreach my $transcript ( sort...
594 
595  #-----------------------------------------------------------------
596  # Pick the best match(es) for this gene.
597  #-----------------------------------------------------------------
598 
599  # Do not want them on both Transcripts and Genes. For Biomart purposes. (Ianl)
600  # So no need to find the best one for the gene.
601 
602 # my $best_score;
603 # foreach my $coord_xref_id (
604 # sort( { $gene_result{$b} <=> $gene_result{$a} }
605 # keys(%gene_result) ) )
606 # {
607 # my $score = $gene_result{$coord_xref_id};
608 #
609 # $best_score ||= $score;
610 #
611 # if (
612 # sprintf( "%.3f", $score ) eq sprintf( "%.3f", $best_score ) )
613 # {
614 # push( @{ $mapped{$coord_xref_id}{'mapped_to'} }, {
615 # 'ensembl_id' => $gene->dbID(),
616 # 'ensembl_object_type' => 'Gene'
617 # } );
618 # }
619 # }
620 
621  } ## end while ( my $gene = shift(...
622  } ## end foreach my $chromosome (@chromosomes)
623 
624  # Make all dumps. Order is important.
625  dump_xref( $xref_filename, $xref_id, \%mapped, \%unmapped );
626  dump_object_xref( $object_xref_filename, $object_xref_id, $analysis_id, \%mapped );
627  dump_unmapped_reason( $unmapped_reason_filename, $unmapped_reason_id,
628  \%unmapped, $core_dbh );
629  dump_unmapped_object( $unmapped_object_filename, $unmapped_object_id,
630  $analysis_id, \%unmapped );
631 
632  if ($do_upload) {
633  # Order is important!
634  upload_data( 'unmapped_reason', $unmapped_reason_filename,
635  $external_db_id, $core_dbh );
636  upload_data( 'unmapped_object', $unmapped_object_filename,
637  $external_db_id, $core_dbh );
638  upload_data( 'object_xref', $object_xref_filename, $external_db_id,
639  $core_dbh );
640  upload_data( 'xref', $xref_filename, $external_db_id, $core_dbh );
641  }
642 
643  $sth_stat = $self->xref->dbc->prepare(
644  "INSERT INTO process_status (status, date) "
645  . "VALUES ('coordinate_xref_finished',now())" );
646  $sth_stat->execute();
647  $sth_stat->finish();
648 
649  $self->biomart_fix("UCSC","Translation","Gene");
650  $self->biomart_fix("UCSC","Transcript","Gene");
651 
652 } ## end sub run_coordinatemapping
653 
654 #-----------------------------------------------------------------------
655 
656 sub dump_xref {
657  my ( $filename, $xref_id, $mapped, $unmapped ) = @_;
658 
659  ######################################################################
660  # Dump for 'xref'. #
661  ######################################################################
662 
663  my $fh = IO::File->new( '>' . $filename )
664  or croak( sprintf( "Can not open '%s' for writing", $filename ) );
665 
666  log_progress( "Dumping for 'xref' to '%s'\n", $filename );
667 
668  foreach my $xref ( values( %{$unmapped} ), values( %{$mapped} ) ) {
669  # Assign 'xref_id' to this Xref.
670  $xref->{'xref_id'} = ++$xref_id;
671 
672  my $accession = $xref->{'accession'};
673 
674  my ($version) = ( $accession =~ /\.(\d+)$/ );
675  $version ||= 0;
676 
677  my $info_text = (defined($xref->{'chr_name'}) && $xref->{'chr_name'} eq 'Y' ? "Y Chromosome" : "");
678 
679  $fh->printf("%d\t%d\t%s\t%s\t%d\t%s\t%s\t%s\n",
680  $xref->{'xref_id'},
681  $xref->{'external_db_id'},
682  $accession,
683  $accession,
684  $version,
685  '\N',
686  'COORDINATE_OVERLAP',
687  $info_text
688  );
689  }
690  $fh->close();
691 
692  log_progress("Dumping for 'xref' done\n");
693 
694 } ## end sub dump_xref
695 
696 #-----------------------------------------------------------------------
697 
698 sub dump_object_xref {
699  my ( $filename, $object_xref_id, $analysis_id, $mapped ) = @_;
700 
701  ######################################################################
702  # Dump for 'object_xref'. #
703  ######################################################################
704 
705  my $fh = IO::File->new( '>' . $filename )
706  or croak( sprintf( "Can not open '%s' for writing", $filename ) );
707 
708  log_progress( "Dumping for 'object_xref' to '%s'\n", $filename );
709 
710  foreach my $xref ( values( %{$mapped} ) ) {
711  foreach my $object_xref ( @{ $xref->{'mapped_to'} } ) {
712  # Assign 'object_xref_id' to this Object Xref.
713  $object_xref->{'object_xref_id'} = ++$object_xref_id;
714 
715  $fh->printf( "%d\t%d\t%s\t%d\t%s\t%s\n",
716  $object_xref->{'object_xref_id'},
717  $object_xref->{'ensembl_id'},
718  $object_xref->{'ensembl_object_type'},
719  $xref->{'xref_id'},
720  '\N',
721  $analysis_id );
722  }
723  }
724  $fh->close();
725 
726  log_progress("Dumping for 'object_xref' done\n");
727 
728 } ## end sub dump_objexref
729 
730 #-----------------------------------------------------------------------
731 
732 sub dump_unmapped_reason {
733  my ( $filename, $unmapped_reason_id, $unmapped, $core_dbh ) = @_;
734 
735  ######################################################################
736  # Dump for 'unmapped_reason'. #
737  ######################################################################
738 
739  # Create a list of the unique reasons.
740  my %reasons;
741 
742  foreach my $xref ( values( %{$unmapped} ) ) {
743  if ( !exists( $reasons{ $xref->{'reason_full'} } ) ) {
744  $reasons{ $xref->{'reason_full'} } = {
745  'summary' => $xref->{'reason'},
746  'full' => $xref->{'reason_full'}
747  };
748  }
749  }
750 
751  my $fh = IO::File->new( '>' . $filename )
752  or croak( sprintf( "Can not open '%s' for writing", $filename ) );
753 
754  log_progress( "Dumping for 'unmapped_reason' to '%s'\n", $filename );
755 
756  my $sth =
757  $core_dbh->prepare( 'SELECT unmapped_reason_id '
758  . 'FROM unmapped_reason '
759  . 'WHERE full_description = ?' );
760 
761  foreach my $reason (
762  sort( { $a->{'full'} cmp $b->{'full'} } values(%reasons) ) )
763  {
764  # Figure out 'unmapped_reason_id' from the core database.
765  $sth->bind_param( 1, $reason->{'full'}, SQL_VARCHAR );
766 
767  $sth->execute();
768 
769  my $id;
770  $sth->bind_col( 1, \$id );
771  $sth->fetch();
772 
773  if ( defined($id) ) {
774  $reason->{'unmapped_reason_id'} = $id;
775  } else {
776  $reason->{'unmapped_reason_id'} = ++$unmapped_reason_id;
777  }
778 
779  $sth->finish();
780 
781  $fh->printf( "%d\t%s\t%s\n", $reason->{'unmapped_reason_id'},
782  $reason->{'summary'}, $reason->{'full'} );
783 
784  }
785  $fh->close();
786 
787  log_progress("Dumping for 'unmapped_reason' done\n");
788 
789  # Assign reasons to the unmapped Xrefs from %reasons.
790  foreach my $xref ( values( %{$unmapped} ) ) {
791  $xref->{'reason'} = $reasons{ $xref->{'reason_full'} };
792  $xref->{'reason_full'} = undef;
793  }
794 
795 } ## end sub dump_unmapped_reason
796 
797 #-----------------------------------------------------------------------
798 
799 sub dump_unmapped_object {
800  my ( $filename, $unmapped_object_id, $analysis_id, $unmapped ) = @_;
801 
802  ######################################################################
803  # Dump for 'unmapped_object'. #
804  ######################################################################
805 
806  my $fh = IO::File->new( '>' . $filename )
807  or croak( sprintf( "Can not open '%s' for writing", $filename ) );
808 
809  log_progress( "Dumping for 'unmapped_object' to '%s'\n", $filename );
810 
811  foreach my $xref ( values( %{$unmapped} ) ) {
812  # Assign 'unmapped_object_id' to this Xref.
813  $xref->{'unmapped_object_id'} = ++$unmapped_object_id;
814 
815  $fh->printf(
816  "%d\t%s\t%s\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n",
817  $xref->{'unmapped_object_id'},
818  'xref',
819  $analysis_id || '\N', # '\N' (NULL) means no analysis exists
820  # and uploading this table will fail.
821  $xref->{'external_db_id'},
822  $xref->{'accession'},
823  $xref->{'reason'}->{'unmapped_reason_id'}, (
824  defined( $xref->{'score'} )
825  ? sprintf( "%.3f", $xref->{'score'} )
826  : '\N'
827  ),
828  '\N',
829  $xref->{'ensembl_id'} || '\N',
830  ( defined( $xref->{'ensembl_id'} ) ? 'Transcript' : '\N' ),
831  '\N' );
832  }
833  $fh->close();
834 
835  log_progress("Dumping for 'unmapped_object' done\n");
836 
837 } ## end sub dump_unmapped_object
838 
839 #-----------------------------------------------------------------------
840 
841 sub upload_data {
842  my ( $table_name, $filename, $external_db_id, $dbh ) = @_;
843 
844  ######################################################################
845  # Upload data from a file to a table. #
846  ######################################################################
847 
848  if ( !-r $filename ) {
849  croak( sprintf( "Can not open '%s' for reading", $filename ) );
850  }
851 
852  my $cleanup_sql = '';
853  if ( $table_name eq 'unmapped_reason' ) {
854  $cleanup_sql = qq(
855  DELETE ur
856  FROM unmapped_object uo,
857  unmapped_reason ur
858  WHERE uo.external_db_id = ?
859  AND ur.unmapped_reason_id = uo.unmapped_reason_id);
860  } elsif ( $table_name eq 'unmapped_object' ) {
861  $cleanup_sql = qq(
862  DELETE uo
863  FROM unmapped_object uo
864  WHERE uo.external_db_id = ?);
865  } elsif ( $table_name eq 'object_xref' ) {
866  $cleanup_sql = qq(
867  DELETE ox
868  FROM xref x,
869  object_xref ox
870  WHERE x.external_db_id = ?
871  AND ox.xref_id = x.xref_id);
872  } elsif ( $table_name eq 'xref' ) {
873  $cleanup_sql = qq(
874  DELETE x
875  FROM xref x
876  WHERE x.external_db_id = ?);
877  } else {
878  croak( sprintf( "Table '%s' is unknown\n", $table_name ) );
879  }
880 
881  my $load_sql =
882  sprintf( "LOAD DATA LOCAL INFILE ? REPLACE INTO TABLE %s",
883  $table_name );
884 
885  log_progress(
886  "Removing old data (external_db_id = '%d') from table '%s'\n",
887  $external_db_id, $table_name );
888 
889  my $rows = $dbh->do( $cleanup_sql, undef, $external_db_id )
890  or croak( $dbh->strerr() );
891 
892  log_progress( "Removed %d rows\n", $rows );
893 
894  log_progress( "Uploading for '%s' from '%s'\n",
895  $table_name, $filename );
896 
897  $rows = $dbh->do( $load_sql, undef, $filename )
898  or croak( $dbh->errstr() );
899 
900  $dbh->do("OPTIMIZE TABLE $table_name") or croak( $dbh->errstr() );
901 
902  log_progress( "Uploading for '%s' done (%d rows)\n",
903  $table_name, $rows );
904 
905 } ## end sub upload_data
906 
907 #-----------------------------------------------------------------------
908 
909 sub log_progress {
910  my ( $fmt, @params ) = @_;
911 
912  return if (!$verbose);
913  printf( STDERR "COORD==> %s", sprintf( $fmt, @params ) );
914 }
915 
916 1;
Bio::EnsEMBL::Mapper::RangeRegistry
Definition: RangeRegistry.pm:51
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
XrefMapper::BasicMapper
Definition: BasicMapper.pm:8
accession
public accession()
Bio::EnsEMBL::Mapper::RangeRegistry::overlap_size
public Int overlap_size()
Bio::EnsEMBL::DBSQL::DBAdaptor::new
public Bio::EnsEMBL::DBSQL::DBAdaptor new()
XrefMapper::BasicMapper::get_species_id_from_species_name
public get_species_id_from_species_name()
Bio::EnsEMBL::Mapper::RangeRegistry::new
public Bio::EnsEMBL::Mapper::RangeRegistry new()
Bio::EnsEMBL::Mapper::RangeRegistry::check_and_register
public Undef check_and_register()