3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 package XrefParser::RefSeqCoordinateParser;
32 my ($self, $ref_arg) = @_;
33 my $source_id = $ref_arg->{source_id};
34 my $species_id = $ref_arg->{species_id};
35 my $species_name = $ref_arg->{species};
36 my $file = $ref_arg->{file};
37 my $verbose = $ref_arg->{verbose};
38 my $db = $ref_arg->{dba};
39 my $dbi = $ref_arg->{dbi};
40 $dbi = $self->dbi unless defined $dbi;
42 if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
43 croak
"Need to pass source_id, species_id and file as pairs";
47 my $peptide_source_id = $self->get_source_id_for_source_name(
'RefSeq_peptide',
'otherfeatures', $dbi);
48 my $mrna_source_id = $self->get_source_id_for_source_name(
'RefSeq_mRNA',
'otherfeatures', $dbi);
49 my $ncrna_source_id = $self->get_source_id_for_source_name(
'RefSeq_ncRNA',
'otherfeatures', $dbi);
51 my $pred_peptide_source_id =
52 $self->get_source_id_for_source_name(
'RefSeq_peptide_predicted',
'otherfeatures', $dbi);
53 my $pred_mrna_source_id =
54 $self->get_source_id_for_source_name(
'RefSeq_mRNA_predicted',
'otherfeatures', $dbi);
55 my $pred_ncrna_source_id =
56 $self->get_source_id_for_source_name(
'RefSeq_ncRNA_predicted',
'otherfeatures', $dbi);
59 print
"RefSeq_peptide source ID = $peptide_source_id\n";
60 print
"RefSeq_mRNA source ID = $mrna_source_id\n";
61 print
"RefSeq_ncRNA source ID = $ncrna_source_id\n";
62 print
"RefSeq_peptide_predicted source ID = $pred_peptide_source_id\n";
63 print
"RefSeq_mRNA_predicted source ID = $pred_mrna_source_id\n" ;
64 print
"RefSeq_ncRNA_predicted source ID = $pred_ncrna_source_id\n" ;
72 my $transcript_score_threshold = 0.75;
73 my $tl_transcript_score_threshold = 0.75;
76 # Grep the project name, should be ensembl or ensemblgenomes
77 if($file =~ /project[=][>](\S+?)[,]/){
81 # If specified, get core database connection details
82 if($file =~ /host[=][>](\S+?)[,]/){
85 if($file =~ /port[=][>](\S+?)[,]/){
88 if($file =~ /dbname[=][>](\S+?)[,]/){
91 if($file =~ /pass[=][>](\S+?)[,]/){
94 if($file =~ /user[=][>](\S+?)[,]/){
104 # If specified, get otherfeatures database connection details
105 if($file =~ /ofhost[=][>](\S+?)[,]/){
108 if($file =~ /ofuser[=][>](\S+?)[,]/){
111 if($file =~ /ofport[=][>](\S+?)[,]/){
114 if($file =~ /ofdbname[=][>](\S+?)[,]/){
117 if($file =~ /ofpass[=][>](\S+?)[,]/){
121 my $registry =
"Bio::EnsEMBL::Registry";
123 #get the species name
124 my %id2name = $self->species_id2name($dbi);
125 if (defined $species_name) { push @{$id2name{$species_id}}, $species_name; }
126 if (!defined $id2name{$species_id}) { next; }
127 $species_name = $id2name{$species_id}[0];
132 if (defined $project && $project eq
'ensembl') {
133 # Can use user-defined database
139 '-dbname' => $dbname,
141 '-species' => $species_name.$host,
145 # Else, database should be on staging
146 $registry->load_registry_from_multiple_dbs(
148 -host =>
'mysql-ens-sta-1',
153 $core_dba = $registry->get_DBAdaptor($species_name,
'core');
155 if (defined $ofhost) {
156 # Can use user-defined database
162 '-dbname' => $ofdbname,
163 '-species' => $species_name,
164 '-group' =>
'otherfeatures',
166 $otherf_dba->
dnadb($core_dba);
168 # Else database should be on staging
169 $registry->load_registry_from_multiple_dbs(
171 -host =>
'mysql-ens-sta-1',
176 $otherf_dba = $registry->get_DBAdaptor($species_name,
'otherfeatures')
if !defined($ofhost);
177 if (defined $otherf_dba) { $otherf_dba->dnadb($core_dba); }
181 } elsif (defined $project && $project eq
'ensemblgenomes') {
182 $registry->load_registry_from_multiple_dbs(
184 -host =>
'mysql-eg-staging-1.ebi.ac.uk',
189 -host =>
'mysql-eg-staging-2.ebi.ac.uk',
195 $core_dba = $registry->get_DBAdaptor($species_name,
'core');
196 $otherf_dba = $registry->get_DBAdaptor($species_name,
'otherfeatures');
198 } elsif (defined $db) {
200 $core_dba = $db->dnadb();
202 die(
"Missing or unsupported project value. Supported values: ensembl, ensemblgenomes");
205 ## Not all species have an otherfeatures database, skip if not found
207 print STDERR
"No otherfeatures database for $species_name, skipping import for refseq_import data\n";
211 ## Add link to EntrezGene IDs where available
212 my (%entrez_ids) = %{ $self->get_valid_codes(
"EntrezGene", $species_id, $dbi) };
213 my $entrez_source_id = $self->get_source_id_for_source_name(
'EntrezGene', undef, $dbi);
214 my $add_dependent_xref_sth = $dbi->prepare(
"INSERT INTO dependent_xref (master_xref_id,dependent_xref_id, linkage_source_id) VALUES (?,?,?)");
216 ## Add link to WikiGene IDs where available
217 my (%wiki_ids) = %{ $self->get_valid_codes(
'WikiGene', $species_id, $dbi) };
218 my $wiki_source_id = $self->get_source_id_for_source_name(
'WikiGene', undef, $dbi);
220 my $sa = $core_dba->get_SliceAdaptor();
221 my $sa_of = $otherf_dba->get_SliceAdaptor();
222 my $chromosomes_of = $sa_of->fetch_all(
'toplevel', undef, 1);
224 # Fetch analysis object for refseq
225 my $aa_of = $otherf_dba->get_AnalysisAdaptor();
227 foreach my $ana(@{ $aa_of->fetch_all() }) {
228 if ($ana->logic_name =~ /refseq_import/) {
229 $logic_name = $ana->logic_name;
232 ## Not all species have refseq_import data, skip if not found
233 if (!defined $logic_name) {
234 print STDERR
"No data found for RefSeq_import, skipping import\n";;
238 foreach my $chromosome_of (@$chromosomes_of) {
239 my $chr_name = $chromosome_of->seq_region_name();
240 my $genes_of = $chromosome_of->get_all_Genes($logic_name, undef, 1);
242 while (my $gene_of = shift @$genes_of) {
243 my $transcripts_of = $gene_of->get_all_Transcripts();
245 # Create a range registry for all the exons of the refseq transcript
246 foreach my $transcript_of (sort { $a->start() <=> $b->start() } @$transcripts_of) {
248 # We're moving to RefSeq accessions being stored as xrefs rather than
249 # stable ids. But we also need to maintain backwards compatbility.
250 # If it's the new kind, where there's a display_xref use that,
251 # otherwise fall back to using the stable_id. But also check if we
252 # have neither, then skip the record.
253 if (defined $transcript_of->display_xref ) {
254 $id = $transcript_of->display_xref->display_id;
255 } elsif (defined $transcript_of->stable_id) {
256 $id = $transcript_of->stable_id;
258 # Skip non conventional accessions
261 if ($id !~ /^[NXMR]{2}_[0-9]+/) { next; }
262 my %transcript_result;
263 my %tl_transcript_result;
264 if (!defined $id) { next; }
265 my $exons_of = $transcript_of->get_all_Exons();
267 my $tl_exons_of = $transcript_of->get_all_translateable_Exons();
270 foreach my $exon_of (@$exons_of) {
271 my $start_of = $exon_of->seq_region_start();
272 my $end_of = $exon_of->seq_region_end();
276 foreach my $tl_exon_of (@$tl_exons_of) {
277 my $tl_start_of = $tl_exon_of->seq_region_start();
278 my $tl_end_of = $tl_exon_of->seq_region_end();
279 $rr3->check_and_register(
'exon', $tl_start_of, $tl_end_of );
282 # Fetch slice in core database which overlaps refseq transcript
283 my $chromosome = $sa->fetch_by_region(
'toplevel', $chr_name, $transcript_of->seq_region_start, $transcript_of->seq_region_end);
284 my $transcripts = $chromosome->get_all_Transcripts(1);
286 # Create a range registry for all the exons of the ensembl transcript
287 foreach my $transcript(@$transcripts) {
288 if ($transcript->strand != $transcript_of->strand) { next; }
289 my $exons = $transcript->get_all_Exons();
293 my $tl_exons = $transcript->get_all_translateable_Exons();
294 my $tl_exon_match = 0;
296 foreach my $exon (@$exons) {
297 my $start = $exon->seq_region_start();
298 my $end = $exon->seq_region_end();
300 $exon_match += $overlap/($end - $start + 1);
301 $rr2->check_and_register(
'exon', $start, $end);
304 foreach my $tl_exon (@$tl_exons) {
305 my $tl_start = $tl_exon->seq_region_start();
306 my $tl_end = $tl_exon->seq_region_end();
307 my $tl_overlap = $rr3->overlap_size(
'exon', $tl_start, $tl_end);
308 $tl_exon_match += $tl_overlap/($tl_end - $tl_start + 1);
309 $rr4->check_and_register(
'exon', $tl_start, $tl_end);
312 my $exon_match_of = 0;
313 my $tl_exon_match_of = 0;
315 # Look for oeverlap between the two sets of exons
316 foreach my $exon_of (@$exons_of) {
317 my $start_of = $exon_of->seq_region_start();
318 my $end_of = $exon_of->seq_region_end();
319 my $overlap_of = $rr2->overlap_size(
'exon', $start_of, $end_of);
320 $exon_match_of += $overlap_of/($end_of - $start_of + 1);
323 foreach my $tl_exon_of (@$tl_exons_of) {
324 my $tl_start_of = $tl_exon_of->seq_region_start();
325 my $tl_end_of = $tl_exon_of->seq_region_end();
326 my $tl_overlap_of = $rr4->overlap_size(
'exon', $tl_start_of, $tl_end_of);
327 $tl_exon_match_of += $tl_overlap_of/($tl_end_of - $tl_start_of + 1);
330 # Comparing exon matching with number of exons to give a score
331 my $score = ( ($exon_match_of + $exon_match)) / (scalar(@$exons_of) + scalar(@$exons) );
333 if (scalar(@$tl_exons_of) > 0) {
334 $tl_score = ( ($tl_exon_match_of + $tl_exon_match)) / (scalar(@$tl_exons_of) + scalar(@$tl_exons) );
336 if ($transcript->biotype eq $transcript_of->biotype) {
337 $transcript_result{$transcript->stable_id} = $score;
338 $tl_transcript_result{$transcript->stable_id} = $tl_score;
340 $transcript_result{$transcript->stable_id} = $score * 0.90;
341 $tl_transcript_result{$transcript->stable_id} = $tl_score * 0.90;
346 my $best_tl_score = 0;
348 my ($score, $tl_score);
349 # Comparing the scores based on coding exon overlap
350 # If there is a stale mate, chose best exon overlap score
351 foreach my $tid (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) {
352 $score = $transcript_result{$tid};
353 $tl_score = $tl_transcript_result{$tid};
354 if ($score > $transcript_score_threshold || $tl_score > $tl_transcript_score_threshold) {
355 if ($tl_score >= $best_tl_score) {
356 if ($tl_score > $best_tl_score) {
358 $best_score = $score;
359 $best_tl_score = $tl_score;
360 } elsif ($tl_score == $best_tl_score) {
361 if ($score > $best_score) {
363 $best_score = $score;
367 if (!defined $best_id) {
368 if ($score >= $best_score) {
370 $best_score = $score;
376 # If a best match was defined for the refseq transcript, store it as direct xref for ensembl transcript
378 my ($acc, $version) = split(/\./, $id);
381 $source_id = $mrna_source_id
if $acc =~ /^NM_/;
382 $source_id = $ncrna_source_id
if $acc =~ /^NR_/;
383 $source_id = $pred_mrna_source_id
if $acc =~ /^XM_/;
384 $source_id = $pred_ncrna_source_id
if $acc =~ /^XR_/;
385 # Accession should be of format NM_/XM_/NR_/XR_ otherwise it is not valid
386 if (!defined $source_id) { next; }
387 my $xref_id = $self->add_xref({ acc => $acc,
391 source_id => $source_id,
392 species_id => $species_id,
394 info_type =>
'DIRECT' });
395 $self->add_direct_xref($xref_id, $best_id,
"Transcript",
"", $dbi);
397 my $t_of = $transcript_of;
398 my $g_of = $t_of->get_Gene();
399 my $entrez_id = $g_of->stable_id();
400 my $tl_of = $t_of->translation();
401 my $ta = $core_dba->get_TranscriptAdaptor();
402 my $t = $ta->fetch_by_stable_id($best_id);
403 my $tl = $t->translation();
405 # Add link between Ensembl gene and EntrezGene
406 if (defined $entrez_ids{$entrez_id} ) {
407 foreach my $dependent_xref_id (@{$entrez_ids{$entrez_id}}) {
408 $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $entrez_source_id);
410 foreach my $dependent_xref_id (@{$wiki_ids{$entrez_id}}) {
411 $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $wiki_source_id);
415 # Also store refseq protein as direct xref for ensembl translation, if translation exists
416 if (defined $tl && defined $tl_of) {
417 if ($tl_of->seq eq $tl->seq) {
418 $tl_id = $tl_of->stable_id();
419 my @xrefs = grep {$_->{dbname} eq
'GenBank'} @{$tl_of->get_all_DBEntries};
420 if(scalar @xrefs == 1) {
421 $tl_id = $xrefs[0]->primary_id();
423 ($acc, $version) = split(/\./, $tl_id);
424 $source_id = $peptide_source_id;
425 $source_id = $pred_peptide_source_id
if $acc =~ /^XP_/;
426 my $tl_xref_id = $self->add_xref({ acc => $acc,
430 source_id => $source_id,
431 species_id => $species_id,
433 info_type =>
'DIRECT' });
434 $self->add_direct_xref($tl_xref_id, $tl->stable_id(),
"Translation",
"", $dbi);