3 # $Source: /cvsroot/ensembl/ensembl-personal/genebuilders/ccds/scripts/store_ccds_xrefs.pl,v $
10 See the NOTICE file distributed with
this work
for additional information
11 regarding copyright ownership.
13 Licensed under the Apache License, Version 2.0 (the
"License");
14 you may not use
this file except in compliance with the License.
15 You may obtain a copy of the License at
19 Unless required by applicable law or agreed to in writing, software
20 distributed under the License is distributed on an
"AS IS" BASIS,
21 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 See the License
for the specific language governing permissions and
23 limitations under the License.
35 Will store the Ensembl
transcript stable_id that matches the ccds structure.
36 Originally written
for Ian Longden. Based on make_enst_to_ccds.pl
40 perl store_ccds_xrefs.pl
58 perl $ENSEMBL_PERSONAL/genebuilders/ccds/scripts/store_ccds_xrefs.pl -ccds_dbname db8_human_vega_61 \
59 -ccds_host genebuild7 -ccds_port 3306 -ccds_user user -ccds_pass password \
60 -dbname homo_sapiens_core_61_37f -host ens-staging1 -port 3306 -user ensro -verbose \
61 -species human -path GRCh37 -write -delete_old
74 # db of CCDS strcutures
76 my $ccds_port =
'3306';
77 my $ccds_user =
'ensro';
78 my $ccds_pass = undef;
81 # db of Ensembl (protein_coding) genes
82 my $host =
'ens-staging';
88 my $species =
'human';
93 &GetOptions(
'ccds_host=s' => \$ccds_host,
94 'ccds_port=s' => \$ccds_port,
95 'ccds_user=s' => \$ccds_user,
96 'ccds_pass=s' => \$ccds_pass,
97 'ccds_dbname=s' => \$ccds_dbname,
101 'dbname=s' => \$dbname,
103 'species=s' => \$species,
104 'verbose' => \$verbose,
105 'delete_old' => \$delete_old,
106 'write' => \$write, );
108 if ( !defined $species ) {
109 throw(
"Please define species as human or mouse");
112 if ( $species =~ /^human$/i || $species =~ /^mouse$/i ) {
114 print
"Species is *$species*\n";
116 throw(
"Species must be defined as human or mouse");
120 # we want to keep a record of any polymorphic pseudogenes for havana
121 # let's not write a file until the end though since they are not
123 my @polymorphic_pseudogene;
131 -dbname => $dbname );
138 -dbname => $ccds_dbname );
139 $ccds_db->dnadb($db);
140 my $ccds_sa = $ccds_db->get_SliceAdaptor;
141 my $outdea = $ccds_db->get_DBEntryAdaptor;
142 my $sa = $db->get_SliceAdaptor;
145 # delete old ones if delete_old set
147 if($write and $delete_old){
148 my $sth = $outdea->prepare(
'delete ox from xref x, object_xref ox, external_db e where x.xref_id = ox.xref_id and x.external_db_id = e.external_db_id and e.db_name like "Ens_%"');
150 $sth->execute || die
"Could not delete old object_xrefs";
152 $sth = $outdea->prepare(
'delete x from xref x, external_db e where x.external_db_id = e.external_db_id and e.db_name like "Ens_%"');
154 $sth->execute || die
"Could not delete ols xrefs";
158 # Loop thru toplevels
160 # maybe should use toplevel instead of chromosome?
161 foreach my $chr ( @{ $ccds_sa->fetch_all(
'chromosome') } ) {
162 print
"Doing chromosome " . $chr->name .
"\n" if ($verbose);
164 # fetch all CCDS structures on slice
165 foreach my $ccds_gene ( @{ $chr->get_all_Genes( undef, undef, 1 ) } ) {
167 # make sure genes are al on chr level
168 $ccds_gene = $ccds_gene->transform(
'chromosome', $path );
170 # loop thru all CCDS transcripts
171 foreach my $ccds_trans ( @{ $ccds_gene->get_all_Transcripts() } ) {
172 print
"=> doing ccds trans "
179 . $ccds_trans->strand .
" \n"
184 my @db_entries = @{ $ccds_trans->get_all_DBEntries(
'CCDS') };
187 foreach my $dbe (@db_entries) {
188 print
"dbe " . $dbe->display_id .
" " . $dbe->dbname .
"\n";
191 # store unique CCDS xrefs for the transcript
192 foreach my $entry (@db_entries) {
193 $xref_hash{ $entry->display_id() } = 1;
196 # we should not have more than one CCDS id
197 # associated with a transcript
198 if ( scalar keys %xref_hash != 1 ) {
199 foreach my $entry ( keys %xref_hash ) {
200 print
" Dodgy xref : " . $entry .
"\n";
202 throw(
"Something odd going on: Transcript dbID "
203 . $ccds_trans->dbID .
" has "
204 . scalar( keys %xref_hash )
207 # all is good; CCDS transcript only has 1 CCDS xref
208 foreach my $entry ( keys %xref_hash ) {
210 print
"=> on ccds $ccds_id\n" if ($verbose);
214 # define the genomic location that we're working in
215 # ie. where the CCDS transcript is
216 my $chr_name = $ccds_trans->slice->seq_region_name;
217 my $start = $ccds_trans->start();
218 my $end = $ccds_trans->end();
220 # now fetch the slice out of ensembl db
222 $sa->fetch_by_region(
'chromosome', $chr_name, $start, $end,
'1',
224 print
" Ensembl slice name " . $slice->name .
"\n" if ($verbose);
226 # get ccds coding exons
227 my @ccds_exons = @{ $ccds_trans->get_all_translateable_Exons() };
228 print
" have " . @ccds_exons .
" ccds coding exons\n" if ($verbose);
230 # get all Ensembl genes overlapping the CCDS regions
231 foreach my $gene ( @{ $slice->get_all_Genes( undef, undef, 1 ) } ) {
233 # only look at protein_coding genes
234 next unless ( $gene->biotype =~ /protein_coding/ || $gene->biotype =~ /polymorphic_pseudogene/);
237 # next if $gene->biotype =~ /protein_coding/ ;
239 # keep a record if it is a polymorphic pseudogene - these will need to be sent to havana
240 if ($gene->biotype =~ /polymorphic_pseudogene/) {
241 print STDERR
" found a poly pseudo gene\n" if ($verbose);
242 push @polymorphic_pseudogene, $ccds_id;
245 # make sure ensembl gene also on chr level
246 print
" on ensembl gene " . $gene->display_id .
"\n" if ($verbose);
247 $gene = $gene->transform(
'chromosome', $path );
249 # loop thru ensembl transcripts
250 foreach my $trans ( @{ $gene->get_all_Transcripts } ) {
251 print
" on ensembl trans " . $trans->display_id .
"\n"
254 # get ensembl coding exons
255 my @exons = @{ $trans->get_all_translateable_Exons() };
256 print
" have " . @exons .
" ensembl coding exons\n" if ($verbose);
258 # loop thru ensembl coding exons and make sure they all match the ccds
262 if ( scalar @exons == scalar @ccds_exons ) {
263 for ( my $i = 0 ; $i < scalar(@exons) ; $i++ ) {
264 # print " Ensembl start ".$exons[$i]->start." end ".$exons[$i]->end.
265 # " CCDS start ".$ccds_exons[$i]->start." end ".$ccds_exons[$i]->end."\n";
267 if ( $ccds_exons[$i]->start == $exons[$i]->start
268 && $ccds_exons[$i]->end == $exons[$i]->end
269 && $ccds_exons[$i]->strand == $exons[$i]->strand )
274 # print "no match ".$ccds_exons[$i]->start." != ".$exons[$i]->start." or ".
275 # $ccds_exons[$i]->end." != ".$exons[$i]->end."\n";
279 if ( $match == scalar @exons ) {
280 print
"MATCH\t" . $trans->stable_id .
"\t" . $ccds_id .
"\n"
283 $trans->stable_id, $write );
286 $ccds_trans->translation,
287 $trans->translation->stable_id,
290 print
" no match ($match)\t"
291 . $trans->stable_id .
"\t"
295 } ## end
if ( scalar @exons == ...)
296 } ## end
foreach my $trans ( @{ $gene...})
297 } ## end
foreach my $gene ( @{ $slice...})
298 } ## end
foreach my $ccds_trans ( @{...})
299 } ## end
foreach my $ccds_gene ( @{ ...})
300 } ## end
foreach my $chr ( @{ $ccds_sa...})
302 # report polymorphic pseudogenes
303 if (@polymorphic_pseudogene) {
304 for my $display_id (@polymorphic_pseudogene) {
305 print STDERR $display_id.
" matches a polymorphic pseudogene\n";
308 print STDERR
"Found 0 polymorphic pseudogenes\n";
313 my ( $dbea, $species, $ccds_trans, $ensembl_trans_stable_id, $write ) = @_;
315 if ( ref($ccds_trans) eq
"Bio::EnsEMBL::Transcript" ) {
318 if ( $species =~ /^human$/i ) {
319 $external_db =
'Ens_Hs_transcript';
320 } elsif ( $species =~ /^mouse$/i ) {
321 $external_db =
'Ens_Mm_transcript';
327 -primary_id => $ensembl_trans_stable_id,
328 -display_id => $ensembl_trans_stable_id,
330 -dbname => $external_db);
332 $dbea->store( $entry, $ccds_trans->dbID,
'Transcript' )
if ($write);
334 } elsif ( ref($ccds_trans) eq
"Bio::EnsEMBL::Translation" ) {
336 if ( $species =~ /^human$/i ) {
337 $external_db =
'Ens_Hs_translation';
338 } elsif ( $species =~ /^mouse$/i ) {
339 $external_db =
'Ens_Mm_translation';
344 -primary_id => $ensembl_trans_stable_id,
345 -display_id => $ensembl_trans_stable_id,
347 -dbname => $external_db);
349 $dbea->store( $entry, $ccds_trans->dbID,
'Translation' )
if ($write);
352 throw(
"Not a Transcript or Translation ");