my ($self, $ref_arg) = @_;
my $source_id = $ref_arg->{source_id};
my $species_id = $ref_arg->{species_id};
my $species_name = $ref_arg->{species};
my $file = $ref_arg->{file};
my $verbose = $ref_arg->{verbose};
my $db = $ref_arg->{dba};
my $dbi = $ref_arg->{dbi};
$dbi = $self->dbi unless defined $dbi;
if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
croak "Need to pass source_id, species_id and file as pairs";
}
$verbose |=0;
my $peptide_source_id = $self->get_source_id_for_source_name('RefSeq_peptide', 'otherfeatures', $dbi);
my $mrna_source_id = $self->get_source_id_for_source_name('RefSeq_mRNA', 'otherfeatures', $dbi);
my $ncrna_source_id = $self->get_source_id_for_source_name('RefSeq_ncRNA', 'otherfeatures', $dbi);
my $pred_peptide_source_id =
$self->get_source_id_for_source_name('RefSeq_peptide_predicted', 'otherfeatures', $dbi);
my $pred_mrna_source_id =
$self->get_source_id_for_source_name('RefSeq_mRNA_predicted','otherfeatures', $dbi);
my $pred_ncrna_source_id =
$self->get_source_id_for_source_name('RefSeq_ncRNA_predicted', 'otherfeatures', $dbi);
if($verbose){
print "RefSeq_peptide source ID = $peptide_source_id\n";
print "RefSeq_mRNA source ID = $mrna_source_id\n";
print "RefSeq_ncRNA source ID = $ncrna_source_id\n";
print "RefSeq_peptide_predicted source ID = $pred_peptide_source_id\n";
print "RefSeq_mRNA_predicted source ID = $pred_mrna_source_id\n" ;
print "RefSeq_ncRNA_predicted source ID = $pred_ncrna_source_id\n" ;
}
my $user = "ensro";
my $host;
my $port = 3306;
my $dbname;
my $pass;
my $transcript_score_threshold = 0.75;
my $tl_transcript_score_threshold = 0.75;
my $project;
# Grep the project name, should be ensembl or ensemblgenomes
if($file =~ /project[=][>](\S+?)[,]/){
$project = $1;
}
# If specified, get core database connection details
if($file =~ /host[=][>](\S+?)[,]/){
$host = $1;
}
if($file =~ /port[=][>](\S+?)[,]/){
$port = $1;
}
if($file =~ /dbname[=][>](\S+?)[,]/){
$dbname = $1;
}
if($file =~ /pass[=][>](\S+?)[,]/){
$pass = $1;
}
if($file =~ /user[=][>](\S+?)[,]/){
$user = $1;
}
my $ofuser = 'ensro';
my $ofhost;
my $ofport = 3306;
my $ofdbname;
my $ofpass;
# If specified, get otherfeatures database connection details
if($file =~ /ofhost[=][>](\S+?)[,]/){
$ofhost = $1;
}
if($file =~ /ofuser[=][>](\S+?)[,]/){
$ofuser = $1;
}
if($file =~ /ofport[=][>](\S+?)[,]/){
$ofport = $1;
}
if($file =~ /ofdbname[=][>](\S+?)[,]/){
$ofdbname = $1;
}
if($file =~ /ofpass[=][>](\S+?)[,]/){
$ofpass = $1;
}
my $registry = "Bio::EnsEMBL::Registry";
#get the species name
my %id2name = $self->species_id2name($dbi);
if (defined $species_name) { push @{$id2name{$species_id}}, $species_name; }
if (!defined $id2name{$species_id}) { next; }
$species_name = $id2name{$species_id}[0];
my $core_dba;
my $otherf_dba;
if (defined $project && $project eq 'ensembl') {
# Can use user-defined database
if (defined $host) {
'-host' => $host,
'-user' => $user,
'-pass' => $pass,
'-dbname' => $dbname,
'-port' => $port,
'-species' => $species_name.$host,
'-group' => 'core',
);
} else {
# Else, database should be on staging
$registry->load_registry_from_multiple_dbs(
{
-host => 'mysql-ens-sta-1',
'-port' => 4519,
-user => 'ensro',
},
);
$core_dba = $registry->get_DBAdaptor($species_name,'core');
}
if (defined $ofhost) {
# Can use user-defined database
'-host' => $ofhost,
'-user' => $ofuser,
'-pass' => $ofpass,
'-port' => $ofport,
'-dbname' => $ofdbname,
'-species' => $species_name,
'-group' => 'otherfeatures',
);
$otherf_dba->
dnadb($core_dba);
} else {
# Else database should be on staging
$registry->load_registry_from_multiple_dbs(
{
-host => 'mysql-ens-sta-1',
'-port' => 4519,
-user => 'ensro',
},
);
$otherf_dba = $registry->get_DBAdaptor($species_name, 'otherfeatures') if !defined($ofhost);
if (defined $otherf_dba) { $otherf_dba->dnadb($core_dba); }
}
} elsif (defined $project && $project eq 'ensemblgenomes') {
$registry->load_registry_from_multiple_dbs(
{
-host => 'mysql-eg-staging-1.ebi.ac.uk',
-port => 4160,
-user => 'ensro',
},
{
-host => 'mysql-eg-staging-2.ebi.ac.uk',
-port => 4275,
-user => 'ensro',
},
);
$core_dba = $registry->get_DBAdaptor($species_name,'core');
$otherf_dba = $registry->get_DBAdaptor($species_name, 'otherfeatures');
} elsif (defined $db) {
$otherf_dba = $db;
$core_dba = $db->dnadb();
} else {
die("Missing or unsupported project value. Supported values: ensembl, ensemblgenomes");
}
## Not all species have an otherfeatures database, skip if not found
if (!$otherf_dba) {
print STDERR "No otherfeatures database for $species_name, skipping import for refseq_import data\n";
return;
}
## Add link to EntrezGene IDs where available
my (%entrez_ids) = %{ $self->get_valid_codes("EntrezGene", $species_id, $dbi) };
my $entrez_source_id = $self->get_source_id_for_source_name('EntrezGene', undef, $dbi);
my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id,dependent_xref_id, linkage_source_id) VALUES (?,?,?)");
## Add link to WikiGene IDs where available
my (%wiki_ids) = %{ $self->get_valid_codes('WikiGene', $species_id, $dbi) };
my $wiki_source_id = $self->get_source_id_for_source_name('WikiGene', undef, $dbi);
my $sa = $core_dba->get_SliceAdaptor();
my $sa_of = $otherf_dba->get_SliceAdaptor();
my $chromosomes_of = $sa_of->fetch_all('toplevel', undef, 1);
# Fetch analysis object for refseq
my $aa_of = $otherf_dba->get_AnalysisAdaptor();
my $logic_name;
foreach my $ana(@{ $aa_of->fetch_all() }) {
if ($ana->logic_name =~ /refseq_import/) {
$logic_name = $ana->logic_name;
}
}
## Not all species have refseq_import data, skip if not found
if (!defined $logic_name) {
print STDERR "No data found for RefSeq_import, skipping import\n";;
return;
}
foreach my $chromosome_of (@$chromosomes_of) {
my $chr_name = $chromosome_of->seq_region_name();
my $genes_of = $chromosome_of->get_all_Genes($logic_name, undef, 1);
while (my $gene_of = shift @$genes_of) {
my $transcripts_of = $gene_of->get_all_Transcripts();
# Create a range registry for all the exons of the refseq transcript
foreach my $transcript_of (sort { $a->start() <=> $b->start() } @$transcripts_of) {
my ($id, $tl_id);
# We're moving to RefSeq accessions being stored as xrefs rather than
# stable ids. But we also need to maintain backwards compatbility.
# If it's the new kind, where there's a display_xref use that,
# otherwise fall back to using the stable_id. But also check if we
# have neither, then skip the record.
if (defined $transcript_of->display_xref ) {
$id = $transcript_of->display_xref->display_id;
} elsif (defined $transcript_of->stable_id) {
$id = $transcript_of->stable_id;
} else {
# Skip non conventional accessions
next;
}
if ($id !~ /^[NXMR]{2}_[0-9]+/) { next; }
my %transcript_result;
my %tl_transcript_result;
if (!defined $id) { next; }
my $exons_of = $transcript_of->get_all_Exons();
my $tl_exons_of = $transcript_of->get_all_translateable_Exons();
foreach my $exon_of (@$exons_of) {
my $start_of = $exon_of->seq_region_start();
my $end_of = $exon_of->seq_region_end();
}
foreach my $tl_exon_of (@$tl_exons_of) {
my $tl_start_of = $tl_exon_of->seq_region_start();
my $tl_end_of = $tl_exon_of->seq_region_end();
$rr3->check_and_register( 'exon', $tl_start_of, $tl_end_of );
}
# Fetch slice in core database which overlaps refseq transcript
my $chromosome = $sa->fetch_by_region('toplevel', $chr_name, $transcript_of->seq_region_start, $transcript_of->seq_region_end);
my $transcripts = $chromosome->get_all_Transcripts(1);
# Create a range registry for all the exons of the ensembl transcript
foreach my $transcript(@$transcripts) {
if ($transcript->strand != $transcript_of->strand) { next; }
my $exons = $transcript->get_all_Exons();
my $exon_match = 0;
my $tl_exons = $transcript->get_all_translateable_Exons();
my $tl_exon_match = 0;
foreach my $exon (@$exons) {
my $start = $exon->seq_region_start();
my $end = $exon->seq_region_end();
$exon_match += $overlap/($end - $start + 1);
$rr2->check_and_register('exon', $start, $end);
}
foreach my $tl_exon (@$tl_exons) {
my $tl_start = $tl_exon->seq_region_start();
my $tl_end = $tl_exon->seq_region_end();
my $tl_overlap = $rr3->overlap_size('exon', $tl_start, $tl_end);
$tl_exon_match += $tl_overlap/($tl_end - $tl_start + 1);
$rr4->check_and_register('exon', $tl_start, $tl_end);
}
my $exon_match_of = 0;
my $tl_exon_match_of = 0;
# Look for oeverlap between the two sets of exons
foreach my $exon_of (@$exons_of) {
my $start_of = $exon_of->seq_region_start();
my $end_of = $exon_of->seq_region_end();
my $overlap_of = $rr2->overlap_size('exon', $start_of, $end_of);
$exon_match_of += $overlap_of/($end_of - $start_of + 1);
}
foreach my $tl_exon_of (@$tl_exons_of) {
my $tl_start_of = $tl_exon_of->seq_region_start();
my $tl_end_of = $tl_exon_of->seq_region_end();
my $tl_overlap_of = $rr4->overlap_size('exon', $tl_start_of, $tl_end_of);
$tl_exon_match_of += $tl_overlap_of/($tl_end_of - $tl_start_of + 1);
}
# Comparing exon matching with number of exons to give a score
my $score = ( ($exon_match_of + $exon_match)) / (scalar(@$exons_of) + scalar(@$exons) );
my $tl_score = 0;
if (scalar(@$tl_exons_of) > 0) {
$tl_score = ( ($tl_exon_match_of + $tl_exon_match)) / (scalar(@$tl_exons_of) + scalar(@$tl_exons) );
}
if ($transcript->biotype eq $transcript_of->biotype) {
$transcript_result{$transcript->stable_id} = $score;
$tl_transcript_result{$transcript->stable_id} = $tl_score;
} else {
$transcript_result{$transcript->stable_id} = $score * 0.90;
$tl_transcript_result{$transcript->stable_id} = $tl_score * 0.90;
}
}
my $best_score = 0;
my $best_tl_score = 0;
my $best_id;
my ($score, $tl_score);
# Comparing the scores based on coding exon overlap
# If there is a stale mate, chose best exon overlap score
foreach my $tid (sort { $transcript_result{$b} <=> $transcript_result{$a} } keys(%transcript_result)) {
$score = $transcript_result{$tid};
$tl_score = $tl_transcript_result{$tid};
if ($score > $transcript_score_threshold || $tl_score > $tl_transcript_score_threshold) {
if ($tl_score >= $best_tl_score) {
if ($tl_score > $best_tl_score) {
$best_id = $tid;
$best_score = $score;
$best_tl_score = $tl_score;
} elsif ($tl_score == $best_tl_score) {
if ($score > $best_score) {
$best_id = $tid;
$best_score = $score;
}
}
}
if (!defined $best_id) {
if ($score >= $best_score) {
$best_id = $tid;
$best_score = $score;
}
}
}
}
# If a best match was defined for the refseq transcript, store it as direct xref for ensembl transcript
if ($best_id) {
my ($acc, $version) = split(/\./, $id);
$version =~ s/\D
my $source_id;
$source_id = $mrna_source_id if $acc =~ /^NM_/;
$source_id = $ncrna_source_id if $acc =~ /^NR_/;
$source_id = $pred_mrna_source_id if $acc =~ /^XM_/;
$source_id = $pred_ncrna_source_id if $acc =~ /^XR_/;
# Accession should be of format NM_/XM_/NR_/XR_ otherwise it is not valid
if (!defined $source_id) { next; }
my $xref_id = $self->add_xref({ acc => $acc,
version => $version,
label => $id,
desc => undef,
source_id => $source_id,
species_id => $species_id,
dbi => $dbi,
info_type => 'DIRECT' });
$self->add_direct_xref($xref_id, $best_id, "Transcript", "", $dbi);
my $t_of = $transcript_of;
my $g_of = $t_of->get_Gene();
my $entrez_id = $g_of->stable_id();
my $tl_of = $t_of->translation();
my $ta = $core_dba->get_TranscriptAdaptor();
my $t = $ta->fetch_by_stable_id($best_id);
my $tl = $t->translation();
# Add link between Ensembl gene and EntrezGene
if (defined $entrez_ids{$entrez_id} ) {
foreach my $dependent_xref_id (@{$entrez_ids{$entrez_id}}) {
$add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $entrez_source_id);
}
foreach my $dependent_xref_id (@{$wiki_ids{$entrez_id}}) {
$add_dependent_xref_sth->execute($xref_id, $dependent_xref_id, $wiki_source_id);
}
}
# Also store refseq protein as direct xref for ensembl translation, if translation exists
if (defined $tl && defined $tl_of) {
if ($tl_of->seq eq $tl->seq) {
$tl_id = $tl_of->stable_id();
my @xrefs = grep {$_->{dbname} eq 'GenBank'} @{$tl_of->get_all_DBEntries};
if(scalar @xrefs == 1) {
$tl_id = $xrefs[0]->primary_id();
}
($acc, $version) = split(/\./, $tl_id);
$source_id = $peptide_source_id;
$source_id = $pred_peptide_source_id if $acc =~ /^XP_/;
my $tl_xref_id = $self->add_xref({ acc => $acc,
version => $version,
label => $tl_id,
desc => undef,
source_id => $source_id,
species_id => $species_id,
dbi => $dbi,
info_type => 'DIRECT' });
$self->add_direct_xref($tl_xref_id, $tl->stable_id(), "Translation", "", $dbi);
}
}
}
}
}
}
return 0;
}