ensembl-hive  2.7.0
RefSeqCoordinateParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::RefSeqCoordinateParser;
21 
22 use strict;
23 use warnings;
24 use Carp;
25 use DBI;
26 
27 use base qw( XrefParser::BaseParser );
29 
30 sub run_script {
31 
32  my ($self, $ref_arg) = @_;
33  my $source_id = $ref_arg->{source_id};
34  my $species_id = $ref_arg->{species_id};
35  my $species_name = $ref_arg->{species};
36  my $file = $ref_arg->{file};
37  my $verbose = $ref_arg->{verbose};
38  my $db = $ref_arg->{dba};
39  my $dbi = $ref_arg->{dbi};
40  $dbi = $self->dbi unless defined $dbi;
41 
42  if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
43  croak "Need to pass source_id, species_id and file as pairs";
44  }
45  $verbose |=0;
46 
47  my $peptide_source_id = $self->get_source_id_for_source_name('RefSeq_peptide', 'otherfeatures', $dbi);
48  my $mrna_source_id = $self->get_source_id_for_source_name('RefSeq_mRNA', 'otherfeatures', $dbi);
49  my $ncrna_source_id = $self->get_source_id_for_source_name('RefSeq_ncRNA', 'otherfeatures', $dbi);
50 
51  my $pred_peptide_source_id =
52  $self->get_source_id_for_source_name('RefSeq_peptide_predicted', 'otherfeatures', $dbi);
53  my $pred_mrna_source_id =
54  $self->get_source_id_for_source_name('RefSeq_mRNA_predicted','otherfeatures', $dbi);
55  my $pred_ncrna_source_id =
56  $self->get_source_id_for_source_name('RefSeq_ncRNA_predicted', 'otherfeatures', $dbi);
57 
58  if($verbose){
59  print "RefSeq_peptide source ID = $peptide_source_id\n";
60  print "RefSeq_mRNA source ID = $mrna_source_id\n";
61  print "RefSeq_ncRNA source ID = $ncrna_source_id\n";
62  print "RefSeq_peptide_predicted source ID = $pred_peptide_source_id\n";
63  print "RefSeq_mRNA_predicted source ID = $pred_mrna_source_id\n" ;
64  print "RefSeq_ncRNA_predicted source ID = $pred_ncrna_source_id\n" ;
65  }
66 
67  my $user = "ensro";
68  my $host;
69  my $port = 3306;
70  my $dbname;
71  my $pass;
72  my $transcript_score_threshold = 0.75;
73  my $tl_transcript_score_threshold = 0.75;
74  my $project;
75 
76 # Grep the project name, should be ensembl or ensemblgenomes
77  if($file =~ /project[=][>](\S+?)[,]/){
78  $project = $1;
79  }
80 
81 # If specified, get core database connection details
82  if($file =~ /host[=][>](\S+?)[,]/){
83  $host = $1;
84  }
85  if($file =~ /port[=][>](\S+?)[,]/){
86  $port = $1;
87  }
88  if($file =~ /dbname[=][>](\S+?)[,]/){
89  $dbname = $1;
90  }
91  if($file =~ /pass[=][>](\S+?)[,]/){
92  $pass = $1;
93  }
94  if($file =~ /user[=][>](\S+?)[,]/){
95  $user = $1;
96  }
97 
98  my $ofuser = 'ensro';
99  my $ofhost;
100  my $ofport = 3306;
101  my $ofdbname;
102  my $ofpass;
103 
104 # If specified, get otherfeatures database connection details
105  if($file =~ /ofhost[=][>](\S+?)[,]/){
106  $ofhost = $1;
107  }
108  if($file =~ /ofuser[=][>](\S+?)[,]/){
109  $ofuser = $1;
110  }
111  if($file =~ /ofport[=][>](\S+?)[,]/){
112  $ofport = $1;
113  }
114  if($file =~ /ofdbname[=][>](\S+?)[,]/){
115  $ofdbname = $1;
116  }
117  if($file =~ /ofpass[=][>](\S+?)[,]/){
118  $ofpass = $1;
119  }
120 
121  my $registry = "Bio::EnsEMBL::Registry";
122 
123  #get the species name
124  my %id2name = $self->species_id2name($dbi);
125  if (defined $species_name) { push @{$id2name{$species_id}}, $species_name; }
126  if (!defined $id2name{$species_id}) { next; }
127  $species_name = $id2name{$species_id}[0];
128 
129  my $core_dba;
130  my $otherf_dba;
131 
132  if (defined $project && $project eq 'ensembl') {
133 # Can use user-defined database
134  if (defined $host) {
135  $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
136  '-host' => $host,
137  '-user' => $user,
138  '-pass' => $pass,
139  '-dbname' => $dbname,
140  '-port' => $port,
141  '-species' => $species_name.$host,
142  '-group' => 'core',
143  );
144  } else {
145 # Else, database should be on staging
146  $registry->load_registry_from_multiple_dbs(
147  {
148  -host => 'mysql-ens-sta-1',
149  '-port' => 4519,
150  -user => 'ensro',
151  },
152  );
153  $core_dba = $registry->get_DBAdaptor($species_name,'core');
154  }
155  if (defined $ofhost) {
156 # Can use user-defined database
157  $otherf_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
158  '-host' => $ofhost,
159  '-user' => $ofuser,
160  '-pass' => $ofpass,
161  '-port' => $ofport,
162  '-dbname' => $ofdbname,
163  '-species' => $species_name,
164  '-group' => 'otherfeatures',
165  );
166  $otherf_dba->dnadb($core_dba);
167  } else {
168 # Else database should be on staging
169  $registry->load_registry_from_multiple_dbs(
170  {
171  -host => 'mysql-ens-sta-1',
172  '-port' => 4519,
173  -user => 'ensro',
174  },
175  );
176  $otherf_dba = $registry->get_DBAdaptor($species_name, 'otherfeatures') if !defined($ofhost);
177  if (defined $otherf_dba) { $otherf_dba->dnadb($core_dba); }
178  }
179 
180 
181  } elsif (defined $project && $project eq 'ensemblgenomes') {
182  $registry->load_registry_from_multiple_dbs(
183  {
184  -host => 'mysql-eg-staging-1.ebi.ac.uk',
185  -port => 4160,
186  -user => 'ensro',
187  },
188  {
189  -host => 'mysql-eg-staging-2.ebi.ac.uk',
190  -port => 4275,
191  -user => 'ensro',
192  },
193 
194  );
195  $core_dba = $registry->get_DBAdaptor($species_name,'core');
196  $otherf_dba = $registry->get_DBAdaptor($species_name, 'otherfeatures');
197 
198  } elsif (defined $db) {
199  $otherf_dba = $db;
200  $core_dba = $db->dnadb();
201  } else {
202  die("Missing or unsupported project value. Supported values: ensembl, ensemblgenomes");
203  }
204 
205 ## Not all species have an otherfeatures database, skip if not found
206  if (!$otherf_dba) {
207  print STDERR "No otherfeatures database for $species_name, skipping import for refseq_import data\n";
208  return;
209  }
210 
211 ## Add link to EntrezGene IDs where available
212  my (%entrez_ids) = %{ $self->get_valid_codes("EntrezGene", $species_id, $dbi) };
213  my $entrez_source_id = $self->get_source_id_for_source_name('EntrezGene', undef, $dbi);
214  my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id,dependent_xref_id, linkage_source_id) VALUES (?,?,?)");
215 
216  ## Add link to WikiGene IDs where available
217  my (%wiki_ids) = %{ $self->get_valid_codes('WikiGene', $species_id, $dbi) };
218  my $wiki_source_id = $self->get_source_id_for_source_name('WikiGene', undef, $dbi);
219 
220  my $sa = $core_dba->get_SliceAdaptor();
221  my $sa_of = $otherf_dba->get_SliceAdaptor();
222  my $chromosomes_of = $sa_of->fetch_all('toplevel', undef, 1);
223 
224 # Fetch analysis object for refseq
225  my $aa_of = $otherf_dba->get_AnalysisAdaptor();
226  my $logic_name;
227  foreach my $ana(@{ $aa_of->fetch_all() }) {
228  if ($ana->logic_name =~ /refseq_import/) {
229  $logic_name = $ana->logic_name;
230  }
231  }
232 ## Not all species have refseq_import data, skip if not found
233  if (!defined $logic_name) {
234  print STDERR "No data found for RefSeq_import, skipping import\n";;
235  return;
236  }
237 
238  foreach my $chromosome_of (@$chromosomes_of) {
239  my $chr_name = $chromosome_of->seq_region_name();
240  my $genes_of = $chromosome_of->get_all_Genes($logic_name, undef, 1);
241 
242  while (my $gene_of = shift @$genes_of) {
243  my $transcripts_of = $gene_of->get_all_Transcripts();
244 
245 # Create a range registry for all the exons of the refseq transcript
246  foreach my $transcript_of (sort { $a->start() <=> $b->start() } @$transcripts_of) {
247  my ($id, $tl_id);
248  # We're moving to RefSeq accessions being stored as xrefs rather than
249  # stable ids. But we also need to maintain backwards compatbility.
250  # If it's the new kind, where there's a display_xref use that,
251  # otherwise fall back to using the stable_id. But also check if we
252  # have neither, then skip the record.
253  if (defined $transcript_of->display_xref ) {
254  $id = $transcript_of->display_xref->display_id;
255  } elsif (defined $transcript_of->stable_id) {
256  $id = $transcript_of->stable_id;
257  } else {
258  # Skip non conventional accessions
259  next;
260  }
261  if ($id !~ /^[NXMR]{2}_[0-9]+/) { next; }
262  my %transcript_result;
263  my %tl_transcript_result;
264  if (!defined $id) { next; }
265  my $exons_of = $transcript_of->get_all_Exons();
267  my $tl_exons_of = $transcript_of->get_all_translateable_Exons();
269 
270  foreach my $exon_of (@$exons_of) {
271  my $start_of = $exon_of->seq_region_start();
272  my $end_of = $exon_of->seq_region_end();
273  $rr1->check_and_register( 'exon', $start_of, $end_of );
274  }
275 
276  foreach my $tl_exon_of (@$tl_exons_of) {
277  my $tl_start_of = $tl_exon_of->seq_region_start();
278  my $tl_end_of = $tl_exon_of->seq_region_end();
279  $rr3->check_and_register( 'exon', $tl_start_of, $tl_end_of );
280  }
281 
282 # Fetch slice in core database which overlaps refseq transcript
283  my $chromosome = $sa->fetch_by_region('toplevel', $chr_name, $transcript_of->seq_region_start, $transcript_of->seq_region_end);
284  my $transcripts = $chromosome->get_all_Transcripts(1);
285 
286 # Create a range registry for all the exons of the ensembl transcript
287  foreach my $transcript(@$transcripts) {
288  if ($transcript->strand != $transcript_of->strand) { next; }
289  my $exons = $transcript->get_all_Exons();
292  my $exon_match = 0;
293  my $tl_exons = $transcript->get_all_translateable_Exons();
294  my $tl_exon_match = 0;
295 
296  foreach my $exon (@$exons) {
297  my $start = $exon->seq_region_start();
298  my $end = $exon->seq_region_end();
299  my $overlap = $rr1->overlap_size('exon', $start, $end);
300  $exon_match += $overlap/($end - $start + 1);
301  $rr2->check_and_register('exon', $start, $end);
302  }
303 
304  foreach my $tl_exon (@$tl_exons) {
305  my $tl_start = $tl_exon->seq_region_start();
306  my $tl_end = $tl_exon->seq_region_end();
307  my $tl_overlap = $rr3->overlap_size('exon', $tl_start, $tl_end);
308  $tl_exon_match += $tl_overlap/($tl_end - $tl_start + 1);
309  $rr4->check_and_register('exon', $tl_start, $tl_end);
310  }
311 
312  my $exon_match_of = 0;
313  my $tl_exon_match_of = 0;
314 
315 # Look for oeverlap between the two sets of exons
316  foreach my $exon_of (@$exons_of) {
317  my $start_of = $exon_of->seq_region_start();
318  my $end_of = $exon_of->seq_region_end();
319  my $overlap_of = $rr2->overlap_size('exon', $start_of, $end_of);
320  $exon_match_of += $overlap_of/($end_of - $start_of + 1);
321  }
322 
323  foreach my $tl_exon_of (@$tl_exons_of) {
324  my $tl_start_of = $tl_exon_of->seq_region_start();
325  my $tl_end_of = $tl_exon_of->seq_region_end();
326  my $tl_overlap_of = $rr4->overlap_size('exon', $tl_start_of, $tl_end_of);
327  $tl_exon_match_of += $tl_overlap_of/($tl_end_of - $tl_start_of + 1);
328  }
329 
330 # Comparing exon matching with number of exons to give a score
331  my $score = ( ($exon_match_of + $exon_match)) / (scalar(@$exons_of) + scalar(@$exons) );
332  my $tl_score = 0;
333  if (scalar(@$tl_exons_of) > 0) {
334  $tl_score = ( ($tl_exon_match_of + $tl_exon_match)) / (scalar(@$tl_exons_of) + scalar(@$tl_exons) );
335  }
336  if ($transcript->biotype eq $transcript_of->biotype) {
337  $transcript_result{$transcript->stable_id} = $score;
338  $tl_transcript_result{$transcript->stable_id} = $tl_score;
339  } else {
340  $transcript_result{$transcript->stable_id} = $score * 0.90;
341  $tl_transcript_result{$transcript->stable_id} = $tl_score * 0.90;
342  }
343  }
344 
345  my $best_score = 0;
346  my $best_tl_score = 0;
347  my $best_id;
348  my ($score, $tl_score);
349 # Comparing the scores based on coding exon overlap
350 # If there is a stale mate, chose best exon overlap score
351  foreach my $tid (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) {
352  $score = $transcript_result{$tid};
353  $tl_score = $tl_transcript_result{$tid};
354  if ($score > $transcript_score_threshold || $tl_score > $tl_transcript_score_threshold) {
355  if ($tl_score >= $best_tl_score) {
356  if ($tl_score > $best_tl_score) {
357  $best_id = $tid;
358  $best_score = $score;
359  $best_tl_score = $tl_score;
360  } elsif ($tl_score == $best_tl_score) {
361  if ($score > $best_score) {
362  $best_id = $tid;
363  $best_score = $score;
364  }
365  }
366  }
367  if (!defined $best_id) {
368  if ($score >= $best_score) {
369  $best_id = $tid;
370  $best_score = $score;
371  }
372  }
373  }
374  }
375 
376 # If a best match was defined for the refseq transcript, store it as direct xref for ensembl transcript
377  if ($best_id) {
378  my ($acc, $version) = split(/\./, $id);
379  $version =~ s/\D//g if $version;
380  my $source_id;
381  $source_id = $mrna_source_id if $acc =~ /^NM_/;
382  $source_id = $ncrna_source_id if $acc =~ /^NR_/;
383  $source_id = $pred_mrna_source_id if $acc =~ /^XM_/;
384  $source_id = $pred_ncrna_source_id if $acc =~ /^XR_/;
385  # Accession should be of format NM_/XM_/NR_/XR_ otherwise it is not valid
386  if (!defined $source_id) { next; }
387  my $xref_id = $self->add_xref({ acc => $acc,
388  version => $version,
389  label => $id,
390  desc => undef,
391  source_id => $source_id,
392  species_id => $species_id,
393  dbi => $dbi,
394  info_type => 'DIRECT' });
395  $self->add_direct_xref($xref_id, $best_id, "Transcript", "", $dbi);
396 
397  my $t_of = $transcript_of;
398  my $g_of = $t_of->get_Gene();
399  my $entrez_id = $g_of->stable_id();
400  my $tl_of = $t_of->translation();
401  my $ta = $core_dba->get_TranscriptAdaptor();
402  my $t = $ta->fetch_by_stable_id($best_id);
403  my $tl = $t->translation();
404 
405 # Add link between Ensembl gene and EntrezGene
406  if (defined $entrez_ids{$entrez_id} ) {
407  foreach my $dependent_xref_id (@{$entrez_ids{$entrez_id}}) {
408  $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $entrez_source_id);
409  }
410  foreach my $dependent_xref_id (@{$wiki_ids{$entrez_id}}) {
411  $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $wiki_source_id);
412  }
413  }
414 
415 # Also store refseq protein as direct xref for ensembl translation, if translation exists
416  if (defined $tl && defined $tl_of) {
417  if ($tl_of->seq eq $tl->seq) {
418  $tl_id = $tl_of->stable_id();
419  my @xrefs = grep {$_->{dbname} eq 'GenBank'} @{$tl_of->get_all_DBEntries};
420  if(scalar @xrefs == 1) {
421  $tl_id = $xrefs[0]->primary_id();
422  }
423  ($acc, $version) = split(/\./, $tl_id);
424  $source_id = $peptide_source_id;
425  $source_id = $pred_peptide_source_id if $acc =~ /^XP_/;
426  my $tl_xref_id = $self->add_xref({ acc => $acc,
427  version => $version,
428  label => $tl_id,
429  desc => undef,
430  source_id => $source_id,
431  species_id => $species_id,
432  dbi => $dbi,
433  info_type => 'DIRECT' });
434  $self->add_direct_xref($tl_xref_id, $tl->stable_id(), "Translation", "", $dbi);
435  }
436  }
437  }
438  }
439  }
440  }
441  return 0;
442 }
443 
444 1;
Bio::EnsEMBL::Mapper::RangeRegistry
Definition: RangeRegistry.pm:51
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
XrefParser::BaseParser
Definition: BaseParser.pm:8
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
Bio::EnsEMBL::Mapper::RangeRegistry::overlap_size
public Int overlap_size()
Bio::EnsEMBL::DBSQL::DBAdaptor::new
public Bio::EnsEMBL::DBSQL::DBAdaptor new()
Bio::EnsEMBL::DBSQL::DBAdaptor::dnadb
public Dna dnadb()
Bio::EnsEMBL::Mapper::RangeRegistry::new
public Bio::EnsEMBL::Mapper::RangeRegistry new()
Bio::EnsEMBL::Mapper::RangeRegistry::check_and_register
public Undef check_and_register()