3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
22 # This is a set of subroutines used for creating Xrefs based on
23 # coordinate overlaps.
25 package XrefMapper::VBCoordinateMapper;
35 use File::Spec::Functions;
42 our @EXPORT = qw( run_coordinatemapping );
44 our $coding_weight = 2;
47 our $transcript_score_threshold = 0.75;
49 sub run_coordinatemapping {
50 print STDERR
"RUNNING VB COORD MAPPING\n";
51 my ( $mapper, $do_upload ) = @_;
53 my $xref_db = $mapper->xref();
54 my $core_db = $mapper->core();
56 my $species = $core_db->species();
61 # We only do coordinate mapping for mouse and human for now.
62 if ( !( $species eq
'mus_musculus' || $species eq
'homo_sapiens' ) ) {
66 my $output_dir = $core_db->dir();
68 my $xref_filename = catfile( $output_dir,
'xref_coord.txt' );
69 my $object_xref_filename =
70 catfile( $output_dir,
'object_xref_coord.txt' );
71 my $unmapped_reason_filename =
72 catfile( $output_dir,
'unmapped_reason_coord.txt' );
73 my $unmapped_object_filename =
74 catfile( $output_dir,
'unmapped_object_coord.txt' );
76 my $xref_dbh = $xref_db->dbc()->db_handle();
77 my $core_dbh = $core_db->dbc()->db_handle();
79 ######################################################################
80 # Figure out the last used 'xref_id', 'object_xref_id', #
81 # 'unmapped_object_id', and 'unmapped_reason_id' from the Core #
83 ######################################################################
86 $core_dbh->selectall_arrayref(
'SELECT MAX(xref_id) FROM xref')
88 my $object_xref_id = $core_dbh->selectall_arrayref(
89 'SELECT MAX(object_xref_id) FROM object_xref')->[0][0];
90 my $unmapped_object_id = $core_dbh->selectall_arrayref(
91 'SELECT MAX(unmapped_object_id) FROM unmapped_object')->[0][0];
92 my $unmapped_reason_id = $core_dbh->selectall_arrayref(
93 'SELECT MAX(unmapped_reason_id) FROM unmapped_reason')->[0][0];
95 log_progress(
"Last used xref_id is %d\n", $xref_id );
96 log_progress(
"Last used object_xref_id is %d\n",
98 log_progress(
"Last used unmapped_object_id is %d\n",
99 $unmapped_object_id );
100 log_progress(
"Last used unmapped_reason_id is %d\n",
101 $unmapped_reason_id );
103 ######################################################################
104 # Get an 'analysis_id', or discover that we need to add our analysis #
105 # to the 'analyis' table later. #
106 ######################################################################
108 my $analysis_params =
109 sprintf(
"weights(coding,ensembl)="
111 .
"transcript_score_threshold=" .
"%.2f",
112 $coding_weight, $ens_weight, $transcript_score_threshold );
114 my $analysis_sql = qq(
117 WHERE logic_name =
'xrefcoordinatemapping'
121 my $analysis_sth = $core_dbh->prepare($analysis_sql);
122 $analysis_sth->execute($analysis_params);
124 my $analysis_id = $analysis_sth->fetchall_arrayref()->[0][0];
125 if ( !defined($analysis_id) ) {
127 $core_dbh->selectall_arrayref(
"SELECT analysis_id FROM analysis "
128 .
"WHERE logic_name = 'xrefcoordinatemapping'" )->[0][0];
130 if ( defined($analysis_id) && $do_upload ) {
131 log_progress(
"Will update 'analysis' table "
132 .
"with new parameter settings\n" );
134 #-----------------------------------------------------------------
135 # Update an existing analysis.
136 #-----------------------------------------------------------------
140 SET created = now(), parameters = ?
141 WHERE analysis_id = ?
144 $core_dbh->do( $sql, undef, $analysis_params, $analysis_id );
147 log_progress(
"Can not find analysis ID for this analysis:\n");
148 log_progress(
" logic_name = 'xrefcoordinatemapping'\n");
149 log_progress(
" parameters = '%s'\n", $analysis_params );
152 #---------------------------------------------------------------
153 # Store a new analysis.
154 #---------------------------------------------------------------
156 log_progress(
"A new analysis will be added\n");
158 $analysis_id = $core_dbh->selectall_arrayref(
159 'SELECT MAX(analysis_id) FROM analysis')->[0][0];
160 log_progress(
"Last used analysis_id is %d\n", $analysis_id );
162 my $sql =
'INSERT INTO analysis '
163 .
'VALUES(?, now(), ?, \N, \N, \N, ?, \N, \N, ?, ?, \N, \N, \N)';
164 my $sth = $core_dbh->prepare($sql);
166 $sth->execute( ++$analysis_id,
'xrefcoordinatemapping',
167 'CoordinateMapper.pm', $analysis_params,
168 'CoordinateMapper.pm' );
170 } ## end
else [
if ( defined($analysis_id...
171 } ## end
if ( !defined($analysis_id...
173 if ( defined($analysis_id) ) {
174 log_progress(
"Analysis ID is %d\n",
178 ######################################################################
179 # Read and store available Xrefs from the Xref database. #
180 ######################################################################
186 SELECT coord_xref_id, source_id,
accession
191 my $xref_sth = $xref_dbh->prepare($xref_sql);
192 $xref_sth->execute($species_id);
194 while ( my $xref = $xref_sth->fetchrow_hashref() ) {
195 $unmapped{ $xref->{
'coord_xref_id'} } = {
197 $XrefMapper::BasicMapper::source_to_external_db{ $xref->{
199 || 11000, # FIXME (11000 is
'UCSC')
200 'accession' => $xref->{
'accession'},
201 'reason' =>
'No overlap',
203 'No coordinate overlap with any Ensembl transcript' };
207 ######################################################################
208 # Do coordinate matching. #
209 ######################################################################
211 my $core_db_adaptor =
213 -host => $core_db->dbc()->host(),
214 -port => $core_db->dbc()->port(),
215 -user => $core_db->dbc()->username(),
216 -pass => $core_db->dbc()->password(),
217 -dbname => $core_db->dbc()->dbname(),
220 my $slice_adaptor = $core_db_adaptor->get_SliceAdaptor();
221 my @chromosomes = @{ $slice_adaptor->fetch_all(
'Chromosome') };
230 AND chromosome = ? AND strand = ?
231 AND ((txStart >= ? AND txStart <= ?) -- txStart in region
232 OR (txEnd >= ? AND txEnd <= ?) -- txEnd in region
233 OR (txStart <= ? AND txEnd >= ?)) -- region is contained
237 foreach my $chromosome (@chromosomes) {
238 my $chr_name = $chromosome->seq_region_name();
240 log_progress(
"Processing chromsome '%s'\n", $chr_name );
242 my @genes = @{ $chromosome->get_all_Genes( undef, undef, 1 ) };
244 log_progress(
"There are %4d genes on chromosome '%s'\n",
245 scalar(@genes), $chr_name );
247 while ( my $gene = shift(@genes) ) {
248 my @transcripts = @{ $gene->get_all_Transcripts() };
252 foreach my $transcript ( sort { $a->start() <=> $b->start() }
255 ################################################################
256 # For each Ensembl transcript: #
257 # 1. Register all Ensembl exons in a RangeRegistry. #
259 # 2. Find all transcripts in the external database that are #
260 # within the range of this Ensembl transcript. #
262 # For each of those external transcripts: #
263 # 3. Calculate the overlap of the exons of the external #
264 # transcript with the Ensembl exons using the #
265 # overlap_size() method in the RangeRegistry. #
267 # 4. Register the external exons in their own RangeRegistry. #
269 # 5. Calculate the overlap of the Ensembl exons with the #
270 # external exons as in step 3. #
272 # 6. Calculate the match score. #
274 # 7. Decide whether or not to keep the match. #
275 ################################################################
277 my @exons = @{ $transcript->get_all_Exons() };
279 my %transcript_result;
281 # '$rr1' is the RangeRegistry holding Ensembl exons for one
282 # transcript at a time.
285 my $coding_transcript;
286 if ( defined( $transcript->translation() ) ) {
287 $coding_transcript = 1;
289 $coding_transcript = 0;
292 foreach my $exon (@exons) {
293 #-------------------------------------------------------------
294 # Register each exon in the RangeRegistry. Register both the
295 # total length of the exon and the coding range of the exon.
296 #-------------------------------------------------------------
301 if ( $coding_transcript
302 && defined( $exon->coding_region_start($transcript) )
303 && defined( $exon->coding_region_end($transcript) ) )
305 $rr1->check_and_register(
307 $exon->coding_region_start($transcript),
308 $exon->coding_region_end($transcript) );
312 #---------------------------------------------------------------
313 # Get hold of all transcripts from the external database that
314 # overlaps with this Ensembl transcript.
315 #---------------------------------------------------------------
317 my $sth = $xref_dbh->prepare_cached($sql);
318 $sth->execute( $species_id, $chr_name,
319 $gene->strand(), $transcript->start(),
320 $transcript->end(), $transcript->start(),
321 $transcript->end(), $transcript->start(),
322 $transcript->end() );
324 my ( $coord_xref_id, $accession, $txStart, $txEnd, $cdsStart,
325 $cdsEnd, $exonStarts, $exonEnds );
328 \( $coord_xref_id, $accession, $txStart, $txEnd,
329 $cdsStart, $cdsEnd, $exonStarts, $exonEnds
332 while ( $sth->fetch() ) {
333 my @exonStarts = split( /,\s*/, $exonStarts );
334 my @exonEnds = split( /,\s*/, $exonEnds );
335 my $exonCount = scalar(@exonStarts);
337 # '$rr2' is the RangeRegistry holding exons from the external
338 # transcript, for one transcript at a time.
342 my $coding_match = 0;
344 my $coding_count = 0;
346 for ( my $i = 0 ; $i < $exonCount ; ++$i ) {
347 #-----------------------------------------------------------
348 # Register the exons from the external database in the same
349 # was as with the Ensembl exons, and calculate the overlap
350 # of the external exons with the previously registered
352 #-----------------------------------------------------------
359 $overlap/( $exonEnds[$i] - $exonStarts[$i] + 1 );
361 $rr2->check_and_register(
'exon', $exonStarts[$i],
364 if ( !defined($cdsStart) || !defined($cdsEnd) ) {
365 # Non-coding transcript.
367 my $codingStart = ( $exonStarts[$i] > $cdsStart
371 ( $exonEnds[$i] < $cdsEnd ? $exonEnds[$i] : $cdsEnd );
373 if ( $codingStart < $codingEnd ) {
375 $rr1->overlap_size(
'coding', $codingStart,
379 $coding_overlap/( $codingEnd - $codingStart + 1 );
381 $rr2->check_and_register(
'coding', $codingStart,
387 } ## end
for ( my $i = 0 ; $i < ...
390 my $rcoding_match = 0;
392 my $rcoding_count = 0;
394 foreach my $exon (@exons) {
395 #-----------------------------------------------------------
396 # Calculate the overlap of the Ensembl exons with the
398 #-----------------------------------------------------------
401 $rr2->overlap_size(
'exon', $exon->start(),
405 $overlap/( $exon->end() - $exon->start() + 1 );
407 if ( $coding_transcript
408 && defined( $exon->coding_region_start($transcript) )
409 && defined( $exon->coding_region_end($transcript) ) )
412 $rr2->overlap_size(
'coding',
413 $exon->coding_region_start(
415 $exon->coding_region_end(
421 ( $exon->coding_region_end($transcript) -
422 $exon->coding_region_start($transcript) +
427 } ## end
foreach my $exon (@exons)
429 #-------------------------------------------------------------
430 # Calculate the match score.
431 #-------------------------------------------------------------
433 my $score = ( ( $exon_match + $ens_weight*$rexon_match ) +
435 $coding_match + $ens_weight*$rcoding_match
437 )/( ( $exonCount + $ens_weight*scalar(@exons) ) +
439 $coding_count + $ens_weight*$rcoding_count
442 if ( !defined( $transcript_result{$coord_xref_id} )
443 || $transcript_result{$coord_xref_id} < $score )
445 $transcript_result{$coord_xref_id} = $score;
448 } ## end
while ( $sth->fetch() )
451 #---------------------------------------------------------------
452 # Apply transcript threshold and pick the best match(es) for
454 #---------------------------------------------------------------
457 foreach my $coord_xref_id (
458 sort( { $transcript_result{$b} <=> $transcript_result{$a} }
459 keys(%transcript_result) ) )
461 # my $score = $transcript_result{$coord_xref_id};
463 # if ( $score > $transcript_score_threshold ) {
464 # $best_score ||= $score;
466 # if ( sprintf( "%.3f", $score ) eq
467 # sprintf( "%.3f", $best_score ) )
469 if ( exists( $unmapped{$coord_xref_id} ) ) {
470 $mapped{$coord_xref_id} = $unmapped{$coord_xref_id};
471 delete( $unmapped{$coord_xref_id} );
472 $mapped{$coord_xref_id}{
'reason'} = undef;
473 $mapped{$coord_xref_id}{
'reason_full'} = undef;
476 push( @{ $mapped{$coord_xref_id}{
'mapped_to'} }, {
477 'ensembl_id' => $transcript->dbID(),
478 'ensembl_object_type' =>
'Transcript'
481 # # This is now a candidate Xref for the gene.
482 # if ( !defined( $gene_result{$coord_xref_id} )
483 # || $gene_result{$coord_xref_id} < $score )
485 # $gene_result{$coord_xref_id} = $score;
488 # } elsif ( exists( $unmapped{$coord_xref_id} ) ) {
489 # $unmapped{$coord_xref_id}{'reason'} =
490 # 'Was not best match';
491 # $unmapped{$coord_xref_id}{'reason_full'} =
493 # "Did not top best transcript match score (%.2f)",
495 # if ( !defined( $unmapped{$coord_xref_id}{'score'} )
496 # || $score > $unmapped{$coord_xref_id}{'score'} )
498 # $unmapped{$coord_xref_id}{'score'} = $score;
499 # $unmapped{$coord_xref_id}{'ensembl_id'} =
500 # $transcript->dbID();
504 # } elsif ( exists( $unmapped{$coord_xref_id} )
505 # && $unmapped{$coord_xref_id}{'reason'} ne
506 # 'Was not best match' )
508 # $unmapped{$coord_xref_id}{'reason'} =
509 # 'Did not meet threshold';
510 # $unmapped{$coord_xref_id}{'reason_full'} =
511 # sprintf( "Match score for transcript "
512 # . "lower than threshold (%.2f)",
513 # $transcript_score_threshold );
514 # if ( !defined( $unmapped{$coord_xref_id}{'score'} )
515 # || $score > $unmapped{$coord_xref_id}{'score'} )
517 # $unmapped{$coord_xref_id}{'score'} = $score;
518 # $unmapped{$coord_xref_id}{'ensembl_id'} =
519 # $transcript->dbID();
522 } ## end
foreach my $coord_xref_id (...
524 } ## end
foreach my $transcript ( sort...
526 #-----------------------------------------------------------------
527 # Pick the best match(es)
for this gene.
528 #-----------------------------------------------------------------
531 foreach my $coord_xref_id (
532 sort( { $gene_result{$b} <=> $gene_result{$a} }
533 keys(%gene_result) ) )
535 # my $score = $gene_result{$coord_xref_id};
537 # $best_score ||= $score;
540 # sprintf( "%.3f", $score ) eq sprintf( "%.3f", $best_score ) )
542 push( @{ $mapped{$coord_xref_id}{
'mapped_to'} }, {
543 'ensembl_id' => $gene->dbID(),
544 'ensembl_object_type' =>
'Gene'
549 } ## end
while ( my $gene = shift(...
550 } ## end
foreach my $chromosome (@chromosomes)
552 # Make all dumps. Order is important.
553 dump_xref( $xref_filename, $xref_id, \%mapped, \%unmapped );
554 dump_object_xref( $object_xref_filename, $object_xref_id, \%mapped );
555 dump_unmapped_reason( $unmapped_reason_filename, $unmapped_reason_id,
557 dump_unmapped_object( $unmapped_object_filename, $unmapped_object_id,
558 $analysis_id, \%unmapped );
561 upload_data(
'xref', $xref_filename, $core_dbh );
562 upload_data(
'object_xref', $object_xref_filename, $core_dbh );
563 upload_data(
'unmapped_reason', $unmapped_reason_filename,
565 upload_data(
'unmapped_object', $unmapped_object_filename,
569 } ## end sub run_coordinatemapping
571 #-----------------------------------------------------------------------
572 #-----------------------------------------------------------------------
575 my ( $filename, $xref_id, $mapped, $unmapped ) = @_;
577 ######################################################################
579 ######################################################################
581 my $fh = IO::File->new(
'>' . $filename )
582 or croak( sprintf(
"Can not open '%s' for writing", $filename ) );
584 log_progress(
"Dumping for 'xref' to '%s'\n", $filename );
586 foreach my $xref ( values( %{$unmapped} ), values( %{$mapped} ) ) {
587 # Assign 'xref_id' to this Xref.
588 $xref->{
'xref_id'} = ++$xref_id;
590 my $accession = $xref->{
'accession'};
592 my ($version) = ( $accession =~ /\.(\d+)$/ );
595 $fh->printf(
"%d\t%d\t%s\t%s\t%d\t%s\t%s\t%s\n",
597 $xref->{
'external_db_id'},
602 'COORDINATE_OVERLAP',
603 '\N' # FIXME (possibly)
608 log_progress(
"Dumping for 'xref' done\n");
610 } ## end sub dump_xref
612 #-----------------------------------------------------------------------
614 sub dump_object_xref {
615 my ( $filename, $object_xref_id, $mapped ) = @_;
617 ######################################################################
618 # Dump for 'object_xref'. #
619 ######################################################################
621 my $fh = IO::File->new(
'>' . $filename )
622 or croak( sprintf(
"Can not open '%s' for writing", $filename ) );
624 log_progress(
"Dumping for 'object_xref' to '%s'\n", $filename );
626 foreach my $xref ( values( %{$mapped} ) ) {
627 foreach my $object_xref ( @{ $xref->{
'mapped_to'} } ) {
628 # Assign 'object_xref_id' to this Object Xref.
629 $object_xref->{
'object_xref_id'} = ++$object_xref_id;
631 $fh->printf(
"%d\t%d\t%s\t%d\t%s\n",
632 $object_xref->{
'object_xref_id'},
633 $object_xref->{
'ensembl_id'},
634 $object_xref->{
'ensembl_object_type'},
641 log_progress(
"Dumping for 'object_xref' done\n");
643 } ## end sub dump_objexref
645 #-----------------------------------------------------------------------
647 sub dump_unmapped_reason {
648 my ( $filename, $unmapped_reason_id, $unmapped ) = @_;
650 ######################################################################
651 # Dump for 'unmapped_reason'. #
652 ######################################################################
654 # Create a list of the unique reasons.
657 foreach my $xref ( values( %{$unmapped} ) ) {
658 if ( !exists( $reasons{ $xref->{
'reason_full'} } ) ) {
659 $reasons{ $xref->{
'reason_full'} } = {
660 'summary' => $xref->{
'reason'},
661 'full' => $xref->{
'reason_full'}
666 my $fh = IO::File->new(
'>' . $filename )
667 or croak( sprintf(
"Can not open '%s' for writing", $filename ) );
669 log_progress(
"Dumping for 'unmapped_reason' to '%s'\n", $filename );
672 sort( { $a->{
'full'} cmp $b->{
'full'} } values(%reasons) ) )
674 # Assign 'unmapped_reason_id' to this reason.
675 $reason->{
'unmapped_reason_id'} = ++$unmapped_reason_id;
677 $fh->printf(
"%d\t%s\t%s\n", $reason->{
'unmapped_reason_id'},
678 $reason->{
'summary'}, $reason->{
'full'} );
683 log_progress(
"Dumping for 'unmapped_reason' done\n");
685 # Assign reasons to the unmapped Xrefs from %reasons.
686 foreach my $xref ( values( %{$unmapped} ) ) {
687 $xref->{
'reason'} = $reasons{ $xref->{
'reason_full'} };
688 $xref->{
'reason_full'} = undef;
691 } ## end sub dump_unmapped_reason
693 #-----------------------------------------------------------------------
695 sub dump_unmapped_object {
696 my ( $filename, $unmapped_object_id, $analysis_id, $unmapped ) = @_;
698 ######################################################################
699 # Dump for 'unmapped_object'. #
700 ######################################################################
702 my $fh = IO::File->new(
'>' . $filename )
703 or croak( sprintf(
"Can not open '%s' for writing", $filename ) );
705 log_progress(
"Dumping for 'unmapped_object' to '%s'\n", $filename );
707 foreach my $xref ( values( %{$unmapped} ) ) {
708 # Assign 'unmapped_object_id' to this Xref.
709 $xref->{
'unmapped_object_id'} = ++$unmapped_object_id;
712 "%d\t%s\t%s\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n",
713 $xref->{
'unmapped_object_id'},
715 $analysis_id ||
'\N', #
'\N' (NULL) means no analysis exists
716 # and uploading
this table will fail.
717 $xref->{
'external_db_id'},
718 $xref->{
'accession'},
719 $xref->{
'reason'}->{
'unmapped_reason_id'}, (
720 defined( $xref->{
'score'} )
721 ? sprintf(
"%.3f", $xref->{
'score'} )
725 $xref->{
'ensembl_id'} ||
'\N',
726 ( defined( $xref->{
'ensembl_id'} ) ?
'Transcript' :
'\N' ),
731 log_progress(
"Dumping for 'unmapped_object' done\n");
733 } ## end sub dump_unmapped_object
735 #-----------------------------------------------------------------------
738 my ( $table_name, $filename, $dbh ) = @_;
740 ######################################################################
741 # Upload data from a file to a table. #
742 ######################################################################
744 if ( !-r $filename ) {
745 croak( sprintf(
"Can not open '%s' for reading", $filename ) );
748 log_progress(
"Uploading for '%s' from '%s'\n",
749 $table_name, $filename );
752 sprintf(
"LOAD DATA LOCAL INFILE ? REPLACE INTO TABLE %s", $table_name );
754 my $sth = $dbh->prepare($sql);
756 $sth->execute($filename);
758 log_progress(
"Uploading for '%s' done\n", $table_name );
760 } ## end sub upload_data
762 #-----------------------------------------------------------------------
765 my ( $fmt, @params ) = @_;
766 printf( STDERR
"COORD==> %s", sprintf( $fmt, @params ) );