ensembl-hive  2.8.1
store_ccds_xrefs.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 
3 # $Source: /cvsroot/ensembl/ensembl-personal/genebuilders/ccds/scripts/store_ccds_xrefs.pl,v $
4 # $Revision: 1.13 $
5 
6 =pod
7 
8 =head1 LICENSE
9 
10 See the NOTICE file distributed with this work for additional information
11 regarding copyright ownership.
12 
13 Licensed under the Apache License, Version 2.0 (the "License");
14 you may not use this file except in compliance with the License.
15 You may obtain a copy of the License at
16 
17  http://www.apache.org/licenses/LICENSE-2.0
18 
19 Unless required by applicable law or agreed to in writing, software
20 distributed under the License is distributed on an "AS IS" BASIS,
21 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 See the License for the specific language governing permissions and
23 limitations under the License.
24 
25 =head1 NAME
26 
27  store_ccds_xrefs.pl
28 
29 =head1 SYNOPSIS
30 
31  Make CCDS Xrefs.
32 
33 =head1 DESCRIPTION
34 
35  Will store the Ensembl transcript stable_id that matches the ccds structure.
36  Originally written for Ian Longden. Based on make_enst_to_ccds.pl
37 
38 =head1 ARGUMENTS
39 
40  perl store_ccds_xrefs.pl
41  -ccds_dbname
42  -ccds_host
43  -ccds_port
44  -ccds_user
45  -ccds_pass
46  -dbname
47  -host
48  -port
49  -user
50  -verbose
51  -species
52  -path
53  -write
54  -delete_old
55 
56 =head1 EXAMPLE
57 
58  perl $ENSEMBL_PERSONAL/genebuilders/ccds/scripts/store_ccds_xrefs.pl -ccds_dbname db8_human_vega_61 \
59  -ccds_host genebuild7 -ccds_port 3306 -ccds_user user -ccds_pass password \
60  -dbname homo_sapiens_core_61_37f -host ens-staging1 -port 3306 -user ensro -verbose \
61  -species human -path GRCh37 -write -delete_old
62 
63 =cut
64 
65 
66 use warnings;
67 use strict;
68 
69 use Getopt::Long;
70 
72 use Bio::EnsEMBL::Utils::Exception qw(warning throw);
73 
74 # db of CCDS strcutures
75 my $ccds_host = '';
76 my $ccds_port = '3306';
77 my $ccds_user = 'ensro';
78 my $ccds_pass = undef;
79 my $ccds_dbname = '';
80 
81 # db of Ensembl (protein_coding) genes
82 my $host = 'ens-staging';
83 my $port = '';
84 my $user = 'ensro';
85 my $dbname = '';
86 
87 my $path = 'GRCh37';
88 my $species = 'human';
89 my $verbose;
90 my $write;
91 my $delete_old;
92 
93 &GetOptions( 'ccds_host=s' => \$ccds_host,
94  'ccds_port=s' => \$ccds_port,
95  'ccds_user=s' => \$ccds_user,
96  'ccds_pass=s' => \$ccds_pass,
97  'ccds_dbname=s' => \$ccds_dbname,
98  'host=s' => \$host,
99  'port=s' => \$port,
100  'user=s' => \$user,
101  'dbname=s' => \$dbname,
102  'path=s' => \$path,
103  'species=s' => \$species,
104  'verbose' => \$verbose,
105  'delete_old' => \$delete_old,
106  'write' => \$write, );
107 
108 if ( !defined $species ) {
109  throw("Please define species as human or mouse");
110 } else {
111  $species =~ s/\s//g;
112  if ( $species =~ /^human$/i || $species =~ /^mouse$/i ) {
113  # we're ok
114  print "Species is *$species*\n";
115  } else {
116  throw("Species must be defined as human or mouse");
117  }
118 }
119 
120 # we want to keep a record of any polymorphic pseudogenes for havana
121 # let's not write a file until the end though since they are not
122 # common
123 my @polymorphic_pseudogene;
124 
125 
126 # connect to dbs
127 my $db =
128  new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $host,
129  -user => $user,
130  -port => $port,
131  -dbname => $dbname );
132 
133 my $ccds_db =
134  new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $ccds_host,
135  -user => $ccds_user,
136  -pass => $ccds_pass,
137  -port => $ccds_port,
138  -dbname => $ccds_dbname );
139 $ccds_db->dnadb($db);
140 my $ccds_sa = $ccds_db->get_SliceAdaptor;
141 my $outdea = $ccds_db->get_DBEntryAdaptor;
142 my $sa = $db->get_SliceAdaptor;
143 
144 ###
145 # delete old ones if delete_old set
146 ###
147 if($write and $delete_old){
148  my $sth = $outdea->prepare('delete ox from xref x, object_xref ox, external_db e where x.xref_id = ox.xref_id and x.external_db_id = e.external_db_id and e.db_name like "Ens_%"');
149 
150  $sth->execute || die "Could not delete old object_xrefs";
151 
152  $sth = $outdea->prepare('delete x from xref x, external_db e where x.external_db_id = e.external_db_id and e.db_name like "Ens_%"');
153 
154  $sth->execute || die "Could not delete ols xrefs";
155 }
156 
157 # # #
158 # Loop thru toplevels
159 # # #
160 # maybe should use toplevel instead of chromosome?
161 foreach my $chr ( @{ $ccds_sa->fetch_all('chromosome') } ) {
162  print "Doing chromosome " . $chr->name . "\n" if ($verbose);
163 
164  # fetch all CCDS structures on slice
165  foreach my $ccds_gene ( @{ $chr->get_all_Genes( undef, undef, 1 ) } ) {
166 
167  # make sure genes are al on chr level
168  $ccds_gene = $ccds_gene->transform( 'chromosome', $path );
169 
170  # loop thru all CCDS transcripts
171  foreach my $ccds_trans ( @{ $ccds_gene->get_all_Transcripts() } ) {
172  print "=> doing ccds trans "
173  . $ccds_trans->dbID
174  . ": start "
175  . $ccds_trans->start
176  . " stop "
177  . $ccds_trans->end
178  . " strand "
179  . $ccds_trans->strand . " \n"
180  if ($verbose);
181 
182  # find the ccds_id
183  my $ccds_id;
184  my @db_entries = @{ $ccds_trans->get_all_DBEntries('CCDS') };
185  my %xref_hash;
186 
187  foreach my $dbe (@db_entries) {
188  print "dbe " . $dbe->display_id . " " . $dbe->dbname . "\n";
189  }
190 
191  # store unique CCDS xrefs for the transcript
192  foreach my $entry (@db_entries) {
193  $xref_hash{ $entry->display_id() } = 1;
194  }
195 
196  # we should not have more than one CCDS id
197  # associated with a transcript
198  if ( scalar keys %xref_hash != 1 ) {
199  foreach my $entry ( keys %xref_hash ) {
200  print " Dodgy xref : " . $entry . "\n";
201  }
202  throw( "Something odd going on: Transcript dbID "
203  . $ccds_trans->dbID . " has "
204  . scalar( keys %xref_hash )
205  . " xrefs" );
206  } else {
207  # all is good; CCDS transcript only has 1 CCDS xref
208  foreach my $entry ( keys %xref_hash ) {
209  $ccds_id = $entry;
210  print "=> on ccds $ccds_id\n" if ($verbose);
211  }
212  }
213 
214  # define the genomic location that we're working in
215  # ie. where the CCDS transcript is
216  my $chr_name = $ccds_trans->slice->seq_region_name;
217  my $start = $ccds_trans->start();
218  my $end = $ccds_trans->end();
219 
220  # now fetch the slice out of ensembl db
221  my $slice =
222  $sa->fetch_by_region( 'chromosome', $chr_name, $start, $end, '1',
223  $path );
224  print " Ensembl slice name " . $slice->name . "\n" if ($verbose);
225 
226  # get ccds coding exons
227  my @ccds_exons = @{ $ccds_trans->get_all_translateable_Exons() };
228  print " have " . @ccds_exons . " ccds coding exons\n" if ($verbose);
229 
230  # get all Ensembl genes overlapping the CCDS regions
231  foreach my $gene ( @{ $slice->get_all_Genes( undef, undef, 1 ) } ) {
232 
233  # only look at protein_coding genes
234  next unless ( $gene->biotype =~ /protein_coding/ || $gene->biotype =~ /polymorphic_pseudogene/);
235 
236  # debug
237 # next if $gene->biotype =~ /protein_coding/ ;
238 
239  # keep a record if it is a polymorphic pseudogene - these will need to be sent to havana
240  if ($gene->biotype =~ /polymorphic_pseudogene/) {
241  print STDERR " found a poly pseudo gene\n" if ($verbose);
242  push @polymorphic_pseudogene, $ccds_id;
243  }
244 
245  # make sure ensembl gene also on chr level
246  print " on ensembl gene " . $gene->display_id . "\n" if ($verbose);
247  $gene = $gene->transform( 'chromosome', $path );
248 
249  # loop thru ensembl transcripts
250  foreach my $trans ( @{ $gene->get_all_Transcripts } ) {
251  print " on ensembl trans " . $trans->display_id . "\n"
252  if ($verbose);
253 
254  # get ensembl coding exons
255  my @exons = @{ $trans->get_all_translateable_Exons() };
256  print " have " . @exons . " ensembl coding exons\n" if ($verbose);
257 
258  # loop thru ensembl coding exons and make sure they all match the ccds
259  # exons exactly
260  my $match = 0;
261 
262  if ( scalar @exons == scalar @ccds_exons ) {
263  for ( my $i = 0 ; $i < scalar(@exons) ; $i++ ) {
264  # print " Ensembl start ".$exons[$i]->start." end ".$exons[$i]->end.
265  # " CCDS start ".$ccds_exons[$i]->start." end ".$ccds_exons[$i]->end."\n";
266 
267  if ( $ccds_exons[$i]->start == $exons[$i]->start
268  && $ccds_exons[$i]->end == $exons[$i]->end
269  && $ccds_exons[$i]->strand == $exons[$i]->strand )
270  {
271 
272  $match++;
273  } #else {
274  # print "no match ".$ccds_exons[$i]->start." != ".$exons[$i]->start." or ".
275  # $ccds_exons[$i]->end." != ".$exons[$i]->end."\n";
276  #}
277  }
278 
279  if ( $match == scalar @exons ) {
280  print "MATCH\t" . $trans->stable_id . "\t" . $ccds_id . "\n"
281  if ($verbose);
282  store_ensembl_xref( $outdea, $species, $ccds_trans,
283  $trans->stable_id, $write );
284  store_ensembl_xref( $outdea,
285  $species,
286  $ccds_trans->translation,
287  $trans->translation->stable_id,
288  $write );
289  } else {
290  print " no match ($match)\t"
291  . $trans->stable_id . "\t"
292  . $ccds_id . "\n"
293  if ($verbose);
294  }
295  } ## end if ( scalar @exons == ...)
296  } ## end foreach my $trans ( @{ $gene...})
297  } ## end foreach my $gene ( @{ $slice...})
298  } ## end foreach my $ccds_trans ( @{...})
299  } ## end foreach my $ccds_gene ( @{ ...})
300 } ## end foreach my $chr ( @{ $ccds_sa...})
301 
302 # report polymorphic pseudogenes
303 if (@polymorphic_pseudogene) {
304  for my $display_id (@polymorphic_pseudogene) {
305  print STDERR $display_id." matches a polymorphic pseudogene\n";
306  }
307 } else {
308  print STDERR "Found 0 polymorphic pseudogenes\n";
309 }
310 
311 
312 sub store_ensembl_xref {
313  my ( $dbea, $species, $ccds_trans, $ensembl_trans_stable_id, $write ) = @_;
314 
315  if ( ref($ccds_trans) eq "Bio::EnsEMBL::Transcript" ) {
316 
317  my $external_db;
318  if ( $species =~ /^human$/i ) {
319  $external_db = 'Ens_Hs_transcript';
320  } elsif ( $species =~ /^mouse$/i ) {
321  $external_db = 'Ens_Mm_transcript';
322  }
323  # make an xref
324 
325  my $entry =
326  new Bio::EnsEMBL::DBEntry( -adaptor => $dbea,
327  -primary_id => $ensembl_trans_stable_id,
328  -display_id => $ensembl_trans_stable_id,
329  -version => 0,
330  -dbname => $external_db);
331  # store xref
332  $dbea->store( $entry, $ccds_trans->dbID, 'Transcript' ) if ($write);
333 
334  } elsif ( ref($ccds_trans) eq "Bio::EnsEMBL::Translation" ) {
335  my $external_db;
336  if ( $species =~ /^human$/i ) {
337  $external_db = 'Ens_Hs_translation';
338  } elsif ( $species =~ /^mouse$/i ) {
339  $external_db = 'Ens_Mm_translation';
340  }
341  # make an xref
342  my $entry =
343  new Bio::EnsEMBL::DBEntry( -adaptor => $dbea,
344  -primary_id => $ensembl_trans_stable_id,
345  -display_id => $ensembl_trans_stable_id,
346  -version => 0,
347  -dbname => $external_db);
348  # store xref
349  $dbea->store( $entry, $ccds_trans->dbID, 'Translation' ) if ($write);
350 
351  } else {
352  throw("Not a Transcript or Translation ");
353  }
354 
355  return;
356 } ## end sub store_ensembl_xref
transcript
public transcript()
store_ensembl_xref
public store_ensembl_xref()
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
Bio::EnsEMBL::DBEntry
Definition: DBEntry.pm:12
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68