Public Member Functions
public	run_script ()
Detailed Description

Definition at line 5 of file RefSeqCoordinateParser.pm.
Member Function Documentation

◆ run_script()

public XrefParser::RefSeqCoordinateParser::run_script ( )
Undocumented method
Code:
click to view
sub run_script {
 
  my ($self, $ref_arg) = @_;
  my $source_id    = $ref_arg->{source_id};
  my $species_id   = $ref_arg->{species_id};
  my $species_name = $ref_arg->{species};
  my $file         = $ref_arg->{file};
  my $verbose      = $ref_arg->{verbose};
  my $db           = $ref_arg->{dba};
  my $dbi          = $ref_arg->{dbi};
  $dbi = $self->dbi unless defined $dbi;
 
  if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
    croak "Need to pass source_id, species_id and file as pairs";
  }
  $verbose |=0;
 
  my $peptide_source_id = $self->get_source_id_for_source_name('RefSeq_peptide', 'otherfeatures', $dbi);
  my $mrna_source_id = $self->get_source_id_for_source_name('RefSeq_mRNA', 'otherfeatures', $dbi);
  my $ncrna_source_id = $self->get_source_id_for_source_name('RefSeq_ncRNA', 'otherfeatures', $dbi);
 
  my $pred_peptide_source_id =
    $self->get_source_id_for_source_name('RefSeq_peptide_predicted', 'otherfeatures', $dbi);
  my $pred_mrna_source_id =
    $self->get_source_id_for_source_name('RefSeq_mRNA_predicted','otherfeatures', $dbi);
  my $pred_ncrna_source_id =
    $self->get_source_id_for_source_name('RefSeq_ncRNA_predicted', 'otherfeatures', $dbi);
 
  if($verbose){
    print "RefSeq_peptide source ID = $peptide_source_id\n";
    print "RefSeq_mRNA source ID = $mrna_source_id\n";
    print "RefSeq_ncRNA source ID = $ncrna_source_id\n";
    print "RefSeq_peptide_predicted source ID = $pred_peptide_source_id\n";
    print "RefSeq_mRNA_predicted source ID = $pred_mrna_source_id\n" ;
    print "RefSeq_ncRNA_predicted source ID = $pred_ncrna_source_id\n" ;
  }
 
  my $user = "ensro";
  my $host;
  my $port = 3306;
  my $dbname;
  my $pass;
  my $transcript_score_threshold = 0.75;
  my $tl_transcript_score_threshold = 0.75;
  my $project;
 
# Grep the project name, should be ensembl or ensemblgenomes
  if($file =~ /project[=][>](\S+?)[,]/){
    $project = $1;
  }
 
# If specified, get core database connection details
  if($file =~ /host[=][>](\S+?)[,]/){
    $host = $1;
  }
  if($file =~ /port[=][>](\S+?)[,]/){
    $port =  $1;
  }
  if($file =~ /dbname[=][>](\S+?)[,]/){
    $dbname = $1;
  }
  if($file =~ /pass[=][>](\S+?)[,]/){
    $pass = $1;
  }
  if($file =~ /user[=][>](\S+?)[,]/){
    $user = $1;
  }
 
  my $ofuser = 'ensro';
  my $ofhost;
  my $ofport = 3306;
  my $ofdbname;
  my $ofpass;
 
# If specified, get otherfeatures database connection details
  if($file =~ /ofhost[=][>](\S+?)[,]/){
    $ofhost = $1;
  }
  if($file =~ /ofuser[=][>](\S+?)[,]/){
    $ofuser = $1;
  }
  if($file =~ /ofport[=][>](\S+?)[,]/){
    $ofport =  $1;
  }
  if($file =~ /ofdbname[=][>](\S+?)[,]/){
    $ofdbname = $1;
  }
  if($file =~ /ofpass[=][>](\S+?)[,]/){
    $ofpass = $1;
  }
 
  my $registry = "Bio::EnsEMBL::Registry";
 
  #get the species name
  my %id2name = $self->species_id2name($dbi);
  if (defined $species_name) { push @{$id2name{$species_id}}, $species_name; }
  if (!defined $id2name{$species_id}) { next; }
  $species_name = $id2name{$species_id}[0];
 
  my $core_dba;
  my $otherf_dba;
 
  if (defined $project && $project eq 'ensembl') {
# Can use user-defined database
      if (defined $host) {
          $core_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
              '-host'     => $host,
              '-user'     => $user,
              '-pass'     => $pass,
              '-dbname'   => $dbname,
              '-port'     => $port,
              '-species'  => $species_name.$host,
              '-group'    => 'core',
       );
      } else {
# Else, database should be on staging
      $registry->load_registry_from_multiple_dbs(
          {
              -host    => 'mysql-ens-sta-1',
          '-port'    => 4519,
              -user    => 'ensro',
          },
       );
      $core_dba = $registry->get_DBAdaptor($species_name,'core');
      }
      if (defined $ofhost) {
# Can use user-defined database
          $otherf_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
              '-host'     => $ofhost,
              '-user'     => $ofuser,
              '-pass'     => $ofpass,
              '-port'     => $ofport,
              '-dbname'   => $ofdbname,
              '-species'  => $species_name,
              '-group'    => 'otherfeatures',
       );
       $otherf_dba->dnadb($core_dba);
      } else {
# Else database should be on staging
      $registry->load_registry_from_multiple_dbs( 
      {
          -host    => 'mysql-ens-sta-1',
          '-port'    => 4519,
          -user    => 'ensro',
      },
       );
      $otherf_dba = $registry->get_DBAdaptor($species_name, 'otherfeatures') if !defined($ofhost);
      if (defined $otherf_dba) { $otherf_dba->dnadb($core_dba); }
    }
      
 
  } elsif (defined $project && $project eq 'ensemblgenomes') {
      $registry->load_registry_from_multiple_dbs( 
      {
          -host     => 'mysql-eg-staging-1.ebi.ac.uk',
          -port     => 4160,
          -user     => 'ensro',
      },
      {
          -host     => 'mysql-eg-staging-2.ebi.ac.uk',
          -port     => 4275,
          -user     => 'ensro',
      },
 
      );
      $core_dba = $registry->get_DBAdaptor($species_name,'core');
      $otherf_dba = $registry->get_DBAdaptor($species_name, 'otherfeatures');     
 
  } elsif (defined $db) {
    $otherf_dba = $db;
    $core_dba = $db->dnadb();
  } else {
      die("Missing or unsupported project value. Supported values: ensembl, ensemblgenomes");
  }
 
## Not all species have an otherfeatures database, skip if not found
  if (!$otherf_dba) {
    print STDERR "No otherfeatures database for $species_name, skipping import for refseq_import data\n";
    return;
  }
 
## Add link to EntrezGene IDs where available
  my (%entrez_ids) = %{ $self->get_valid_codes("EntrezGene", $species_id, $dbi) };
  my $entrez_source_id = $self->get_source_id_for_source_name('EntrezGene', undef, $dbi);
  my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref  (master_xref_id,dependent_xref_id, linkage_source_id) VALUES (?,?,?)");
 
  ## Add link to WikiGene IDs where available
  my (%wiki_ids) = %{ $self->get_valid_codes('WikiGene', $species_id, $dbi) };
  my $wiki_source_id = $self->get_source_id_for_source_name('WikiGene', undef, $dbi);
 
  my $sa = $core_dba->get_SliceAdaptor();
  my $sa_of = $otherf_dba->get_SliceAdaptor();
  my $chromosomes_of = $sa_of->fetch_all('toplevel', undef, 1);
 
# Fetch analysis object for refseq
  my $aa_of = $otherf_dba->get_AnalysisAdaptor();
  my $logic_name;
  foreach my $ana(@{ $aa_of->fetch_all() }) {
    if ($ana->logic_name =~ /refseq_import/) {
      $logic_name = $ana->logic_name;
    }
  }
## Not all species have refseq_import data, skip if not found
  if (!defined $logic_name) {
    print STDERR "No data found for RefSeq_import, skipping import\n";;
    return;
  }
 
  foreach my $chromosome_of (@$chromosomes_of) {
    my $chr_name = $chromosome_of->seq_region_name();
    my $genes_of = $chromosome_of->get_all_Genes($logic_name, undef, 1);
 
    while (my $gene_of = shift @$genes_of) {
      my $transcripts_of = $gene_of->get_all_Transcripts();
 
# Create a range registry for all the exons of the refseq transcript
      foreach my $transcript_of (sort { $a->start() <=> $b->start() } @$transcripts_of) {
    my ($id, $tl_id);
    # We're moving to RefSeq accessions being stored as xrefs rather than
    # stable ids. But we also need to maintain backwards compatbility.
    # If it's the new kind, where there's a display_xref use that,
    # otherwise fall back to using the stable_id. But also check if we
    # have neither, then skip the record.
    if (defined $transcript_of->display_xref ) {
      $id = $transcript_of->display_xref->display_id;
    } elsif (defined $transcript_of->stable_id) {
      $id = $transcript_of->stable_id;
    } else {
      # Skip non conventional accessions
      next;
    }
        if ($id !~ /^[NXMR]{2}_[0-9]+/)  { next; }
        my %transcript_result;
        my %tl_transcript_result;
        if (!defined $id) { next; }
        my $exons_of = $transcript_of->get_all_Exons();
        my $rr1 = Bio::EnsEMBL::Mapper::RangeRegistry->new();
        my $tl_exons_of = $transcript_of->get_all_translateable_Exons();
        my $rr3 = Bio::EnsEMBL::Mapper::RangeRegistry->new();
 
        foreach my $exon_of (@$exons_of) {
          my $start_of = $exon_of->seq_region_start();
          my $end_of = $exon_of->seq_region_end();
          $rr1->check_and_register( 'exon', $start_of, $end_of );
        }
 
        foreach my $tl_exon_of (@$tl_exons_of) {
          my $tl_start_of = $tl_exon_of->seq_region_start();
          my $tl_end_of = $tl_exon_of->seq_region_end();
          $rr3->check_and_register( 'exon', $tl_start_of, $tl_end_of );
        }
 
# Fetch slice in core database which overlaps refseq transcript
        my $chromosome = $sa->fetch_by_region('toplevel', $chr_name, $transcript_of->seq_region_start, $transcript_of->seq_region_end);
        my $transcripts = $chromosome->get_all_Transcripts(1);
 
# Create a range registry for all the exons of the ensembl transcript
        foreach my $transcript(@$transcripts) {
          if ($transcript->strand != $transcript_of->strand) { next; }
          my $exons = $transcript->get_all_Exons();
          my $rr2 = Bio::EnsEMBL::Mapper::RangeRegistry->new();
          my $rr4 = Bio::EnsEMBL::Mapper::RangeRegistry->new();
          my $exon_match = 0;
          my $tl_exons = $transcript->get_all_translateable_Exons();
          my $tl_exon_match = 0;
 
          foreach my $exon (@$exons) {
            my $start = $exon->seq_region_start();
            my $end = $exon->seq_region_end();
            my $overlap = $rr1->overlap_size('exon', $start, $end);
            $exon_match += $overlap/($end - $start + 1);
            $rr2->check_and_register('exon', $start, $end);
          }
 
          foreach my $tl_exon (@$tl_exons) {
            my $tl_start = $tl_exon->seq_region_start();
            my $tl_end = $tl_exon->seq_region_end();
            my $tl_overlap = $rr3->overlap_size('exon', $tl_start, $tl_end);
            $tl_exon_match += $tl_overlap/($tl_end - $tl_start + 1);
            $rr4->check_and_register('exon', $tl_start, $tl_end);
          }
 
          my $exon_match_of = 0;
          my $tl_exon_match_of = 0;
 
# Look for oeverlap between the two sets of exons
          foreach my $exon_of (@$exons_of) {
            my $start_of = $exon_of->seq_region_start();
            my $end_of = $exon_of->seq_region_end();
            my $overlap_of = $rr2->overlap_size('exon', $start_of, $end_of);
            $exon_match_of += $overlap_of/($end_of - $start_of + 1);
          }
 
          foreach my $tl_exon_of (@$tl_exons_of) {
            my $tl_start_of = $tl_exon_of->seq_region_start();
            my $tl_end_of = $tl_exon_of->seq_region_end();
            my $tl_overlap_of = $rr4->overlap_size('exon', $tl_start_of, $tl_end_of);
            $tl_exon_match_of += $tl_overlap_of/($tl_end_of - $tl_start_of + 1);
          }
 
# Comparing exon matching with number of exons to give a score
          my $score = ( ($exon_match_of + $exon_match)) / (scalar(@$exons_of) + scalar(@$exons) );
          my $tl_score = 0;
          if (scalar(@$tl_exons_of) > 0) {
            $tl_score = ( ($tl_exon_match_of + $tl_exon_match)) / (scalar(@$tl_exons_of) + scalar(@$tl_exons) );
          }
          if ($transcript->biotype eq $transcript_of->biotype) {
            $transcript_result{$transcript->stable_id} = $score;
            $tl_transcript_result{$transcript->stable_id} = $tl_score;
          } else {
            $transcript_result{$transcript->stable_id} = $score * 0.90;
            $tl_transcript_result{$transcript->stable_id} = $tl_score * 0.90;
          }
        }
 
        my $best_score = 0;
        my $best_tl_score = 0;
        my $best_id;
        my ($score, $tl_score);
# Comparing the scores based on coding exon overlap
# If there is a stale mate, chose best exon overlap score
        foreach my $tid (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) {
          $score = $transcript_result{$tid};
          $tl_score = $tl_transcript_result{$tid};
          if ($score > $transcript_score_threshold || $tl_score > $tl_transcript_score_threshold) {
            if ($tl_score >= $best_tl_score) {
              if ($tl_score > $best_tl_score) {
                $best_id = $tid;
                $best_score = $score;
                $best_tl_score = $tl_score;
              } elsif ($tl_score == $best_tl_score) {
                if ($score > $best_score) {
                  $best_id = $tid;
                  $best_score = $score;
                }
              }
            }
            if (!defined $best_id) { 
              if ($score >= $best_score) {
                $best_id = $tid;
                $best_score = $score;
              }
            }
          }
        }
 
# If a best match was defined for the refseq transcript, store it as direct xref for ensembl transcript
        if ($best_id) {
          my ($acc, $version) = split(/\./, $id);
      $version =~ s/\D//g if $version;
          my $source_id;
          $source_id = $mrna_source_id if $acc =~ /^NM_/;
          $source_id = $ncrna_source_id if $acc =~ /^NR_/;
          $source_id = $pred_mrna_source_id if $acc =~ /^XM_/;
          $source_id = $pred_ncrna_source_id if $acc =~ /^XR_/;
          # Accession should be of format NM_/XM_/NR_/XR_ otherwise it is not valid
          if (!defined $source_id) { next; }
          my $xref_id = $self->add_xref({ acc => $acc,
                                          version => $version,
                                          label => $id,
                                          desc => undef,
                                          source_id => $source_id,
                                          species_id => $species_id,
                                          dbi => $dbi,
                                          info_type => 'DIRECT' });
          $self->add_direct_xref($xref_id, $best_id, "Transcript", "", $dbi);
 
      my $t_of = $transcript_of;
          my $g_of = $t_of->get_Gene();
          my $entrez_id = $g_of->stable_id();
          my $tl_of = $t_of->translation();
          my $ta = $core_dba->get_TranscriptAdaptor();
          my $t = $ta->fetch_by_stable_id($best_id);
          my $tl = $t->translation();
 
# Add link between Ensembl gene and EntrezGene
          if (defined $entrez_ids{$entrez_id} ) {
            foreach my $dependent_xref_id (@{$entrez_ids{$entrez_id}}) {
              $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $entrez_source_id);
            }
            foreach my $dependent_xref_id (@{$wiki_ids{$entrez_id}}) {
              $add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $wiki_source_id);
            }
          }
 
# Also store refseq protein as direct xref for ensembl translation, if translation exists
          if (defined $tl && defined $tl_of) {
            if ($tl_of->seq eq $tl->seq) {
              $tl_id = $tl_of->stable_id();
              my @xrefs = grep {$_->{dbname} eq 'GenBank'} @{$tl_of->get_all_DBEntries};
              if(scalar @xrefs == 1) {
                $tl_id = $xrefs[0]->primary_id();
              }
              ($acc, $version) = split(/\./, $tl_id);
              $source_id = $peptide_source_id;
              $source_id = $pred_peptide_source_id if $acc =~ /^XP_/;
              my $tl_xref_id = $self->add_xref({ acc => $acc,
                                              version => $version,
                                              label => $tl_id,
                                              desc => undef,
                                              source_id => $source_id,
                                              species_id => $species_id,
                                              dbi => $dbi,
                                              info_type => 'DIRECT' });
              $self->add_direct_xref($tl_xref_id, $tl->stable_id(), "Translation", "", $dbi);
            }
          }
        }
      }
    }
  }
  return 0;
}
The documentation for this class was generated from the following file:
ensembl/misc-scripts/xref_mapping/XrefParser/RefSeqCoordinateParser.pm
Public Member Functions

Detailed Description

Member Function Documentation

◆ run_script()