3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 package XrefMapper::wormbase;
29 # This module is activated by specifying "taxon=wormbase" in the mapping input file
30 # It contains some common config for worms maintained by WormBase (i.e. having genes
31 # with WBGene ids etc)
# Mapping-method configuration: a default method name plus a hash keyed by an
# alternative method name whose value is a list of source names (presumably the
# sources for which that method replaces the default -- confirm against caller).
# Returned as a two-element list: (default method name, reference to the hash).
35 my $default_method =
'ExonerateGappedBest1';
36 my %override_method_for_source = (
37 ExonerateGappedBest5_subtran => [
'RefSeq_mRNA',
38 'RefSeq_mRNA_predicted',
40 'RefSeq_ncRNA_predicted' ],
43 return $default_method, \%override_method_for_source;
#
# Set gene and transcript display xrefs in the core database from WormBase
# direct xrefs.
#
# Strategy:
# - populate transcript display xref with wormbase_transcript
# - populate gene display xref with wormbase_gseqname, over-written by
#   wormbase_locus for the genes that have a locus name
#
# If wormbase_transcript or wormbase_locus is absent from external_db, nothing
# is changed. Otherwise every gene and transcript display_xref_id is first
# reset to NULL and then re-populated. Progress messages are printed only when
# $self->verbose is true. Returns nothing.
#
sub set_display_xrefs {
  my ($self) = @_;

  print "Building Transcript and Gene display_xrefs using WormBase direct xrefs\n" if ($self->verbose);

  my $dbc = $self->core->dbc;
  my (%external_dbs, %gene_display_xrefs, %tran_display_xrefs);

  #
  # Get external_db ids for the sources we are interested in
  #
  my $edb_sth = $dbc->prepare("SELECT external_db_id, db_name from external_db WHERE db_name like 'wormbase%'");
  $edb_sth->execute();
  while (my ($edb_id, $edb_name) = $edb_sth->fetchrow_array) {
    $external_dbs{$edb_name} = $edb_id;
  }
  $edb_sth->finish;

  if (not exists $external_dbs{wormbase_transcript} or
      not exists $external_dbs{wormbase_locus}) {
    print "Could not find wormbase_transcript and wormbase_locus in external_db table, so doing nothing\n" if $self->verbose;
    return;
  }

  #
  # One prepared statement serves all three look-ups; the external_db id is
  # bound at execute time rather than concatenated into the SQL text.
  # NOTE(review): the query does not restrict ensembl_object_type, so it relies
  # on each of these sources being attached to a single object type -- confirm.
  #
  my $xref_sth = $dbc->prepare(
    "SELECT ensembl_id, x.xref_id " .
    "FROM object_xref ox, xref x " .
    "WHERE ox.xref_id = x.xref_id AND external_db_id = ?");

  # Fill %$display_xref_for (ensembl_id => xref_id) from the named source;
  # entries already present are over-written.
  my $load_display_xrefs = sub {
    my ($db_name, $display_xref_for) = @_;

    $xref_sth->execute($external_dbs{$db_name});
    while (my ($ensembl_id, $xref_id) = $xref_sth->fetchrow_array) {
      $display_xref_for->{$ensembl_id} = $xref_id;
    }
    $xref_sth->finish;
    return;
  };

  #
  # Start with the sequence name for every gene. This source is optional:
  # without the guard an absent wormbase_gseqname row would put an undefined
  # id into the query.
  #
  if (exists $external_dbs{wormbase_gseqname}) {
    $load_display_xrefs->('wormbase_gseqname', \%gene_display_xrefs);
  }
  elsif ($self->verbose) {
    print "Could not find wormbase_gseqname in external_db table, so genes without a locus name get no display xref\n";
  }

  #
  # Some genes will have a locus name. Over-write display xrefs for those that do
  #
  $load_display_xrefs->('wormbase_locus', \%gene_display_xrefs);

  #
  # Get the wormbase_transcript xrefs for the transcripts
  #
  $load_display_xrefs->('wormbase_transcript', \%tran_display_xrefs);

  #
  # Clear all existing display xrefs, so that objects without a WormBase xref
  # do not keep a stale one
  #
  for my $table (qw(gene transcript)) {
    my $reset_sth = $dbc->prepare("UPDATE $table SET display_xref_id = null");
    $reset_sth->execute();
    $reset_sth->finish;
  }

  #
  # Write the new display xrefs
  #
  my $update_gene_sth = $dbc->prepare("UPDATE gene g SET g.display_xref_id= ? WHERE g.gene_id=?");
  my $update_tran_sth = $dbc->prepare("UPDATE transcript t SET t.display_xref_id= ? WHERE t.transcript_id=?");

  foreach my $gid (keys %gene_display_xrefs) {
    $update_gene_sth->execute($gene_display_xrefs{$gid}, $gid);
  }
  $update_gene_sth->finish;

  foreach my $tid (keys %tran_display_xrefs) {
    $update_tran_sth->execute($tran_display_xrefs{$tid}, $tid);
  }
  $update_tran_sth->finish;

  print "Updated display xrefs in core for genes and transcripts\n" if $self->verbose;

  return;
}
147 # over-ride the following, to ensure that our carefully constructed transcript
148 # display ids are not stamped over by the default behaviour (propagation from
150 sub transcript_names_from_gene {
155 sub gene_description_sources {
171 sub gene_description_filter_regexps {
173 return (
'^(Protein \S+\s*)+$',
174 '^Uncharacterized protein\s*\S+\s*',
175 '^Uncharacterized protein\s*',
176 '^Putative uncharacterized protein\s*\S+\s*',
177 '^Putative uncharacterized protein\s*',
178 '^Hypothetical protein\s*\S+\s*',