ensembl-hive  2.8.1
generate_stable_ids.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 # Generate stable IDs for genes/transcripts/translations/exons that have none
19 # Start from current max stable ID + 1
20 
21 use strict;
22 use warnings;
23 
24 use DBI;
25 use Getopt::Long;
26 
# ------------------------------------------------------------------------------
# Main script.
#
# Parses the command line, connects to the database and, for every requested
# object type, prints one "UPDATE <type> SET stable_id = ..." statement to
# STDOUT for each row whose stable_id is NULL.  Nothing is written to the
# database: the emitted SQL is meant to be reviewed and then run by hand.
# Diagnostics go to STDERR so that STDOUT contains nothing but SQL.
# ------------------------------------------------------------------------------

my $help = 0;
my $port = 3306;
my ( $host, $dbname, $user, $pass, @types, $start, $verbose );

if (
    !GetOptions(
        'dbuser|user=s' => \$user,
        'dbpass|pass=s' => \$pass,
        'dbhost|host=s' => \$host,
        'dbport|port=i' => \$port,
        'dbname=s'      => \$dbname,
        'types=s'       => \@types,
        'start=s'       => \$start,
        # Use ENS000001 or ENS for human, ENSMUS00001 or ENSMUS for mouse etc.
        # Do not add G/T/E/P for specific types: the type letter is inserted
        # by increment_stable_id().
        'help!'    => \$help,
        'verbose!' => \$verbose,
    )
    || $help
    || !defined($user)
    || !defined($host)
    || !defined($dbname) )
{
    usage();
    exit;
}

# -types may be given several times and/or as comma-separated lists; flatten
# both forms into one list.  Default to every supported type.
my @known_types = ( 'gene', 'transcript', 'translation', 'exon' );
if ( !@types ) {
    @types = @known_types;
}
@types = split( /,/, join( ',', @types ) );

# The type is interpolated into SQL as a table/column name further down, so
# anything that is not one of the supported names is rejected before any
# query is built.
my %is_known_type = map { $_ => 1 } @known_types;
foreach my $type (@types) {
    if ( !$is_known_type{$type} ) {
        die(  "Unknown type '$type' given to -types - should be one of: "
            . join( ', ', @known_types )
            . "\n" );
    }
}

my $dbi =
  DBI->connect( "DBI:mysql:host=$host:port=$port;database=$dbname",
                $user, $pass, { 'RaiseError' => 1 } )
  || die "Can't connect to database\n";

# Get the timestamp once so that all new stable IDs, of every type, have the
# same created/modified dates.
my ($ts) = $dbi->selectrow_array("SELECT NOW()");
if ( !defined($ts) ) {
    die("Can't get timestamp\n");
}

foreach my $type (@types) {

    # Starting stable ID: either the one given with -start or the current
    # maximum found in the database.
    my $new_stable_id =
      defined($start) ? $start : get_highest_stable_id( $dbi, $type );

    if ($verbose) {
        print( STDERR
               "Highest, pruned $type\_stable_id found : $new_stable_id \n" );
    }

    # Get a list of objects that don't currently have stable IDs assigned
    # and assign new ones, incrementing & preserving formatting as we go.
    my $sql =
        "SELECT ${type}_id "
      . "FROM $type "
      . "WHERE stable_id IS NULL";
    my $sth = $dbi->prepare($sql);
    $sth->execute();

    while ( my @row = $sth->fetchrow_array() ) {
        # increment_stable_id() returns the type-less running ID (fed back
        # into the next iteration) and the full, type-specific new ID.
        ( $new_stable_id, my $nis ) =
          @{ increment_stable_id( $new_stable_id, $type ) };

        print( "UPDATE $type SET stable_id = \'$nis\', version = 1, created_date = \'$ts\', modified_date = \'$ts\'"
              . " WHERE ${type}_id = $row[0];\n" );
    }
    $sth->finish();
} ## end foreach my $type (@types)

$dbi->disconnect();
107 
108 #-------------------------------------------------------------------------------
109 
# increment_stable_id( $stable_id, $type )
#
# Computes the next stable ID after $stable_id for objects of type $type.
#
#   $stable_id - type-less stable ID: a run of letters optionally followed by
#                a run of digits, e.g. "ENSMUS222" or just "ENSMUS".  A value
#                without digits is treated as number 0, so the first ID
#                generated from it has number 1.
#   $type      - one of 'gene', 'transcript', 'translation' or 'exon'.
#
# Returns an array reference with two elements:
#   [0] the incremented type-less ID (prefix + 11-digit number), to be passed
#       back in on the next call;
#   [1] the incremented type-specific ID (prefix + G/T/P/E + 11-digit number).
#
# Dies if $stable_id contains no letters at all or if $type is not one of
# the four supported types.
sub increment_stable_id {
    my ( $stable_id, $type ) = @_;

    if ( !defined($stable_id) ) {
        die(  "unrecognized stable_id format: (undefined) "
            . "- should match ([a-zA-Z]+)([0-9]+) or ([a-zA-Z]+) !!\n" );
    }

    my ( $prefix, $suffix );

    # Check stable_id format ...
    if ( $stable_id =~ m/([a-zA-Z]+)([0-9]+)/ ) {
        ( $prefix, $suffix ) = ( $1, $2 );
    } elsif ( $stable_id =~ m/[a-zA-Z]+/ ) {
        # Prefix only (e.g. "-start ENSNEW"): start counting from zero
        # instead of doing arithmetic on an undefined value.
        ( $prefix, $suffix ) = ( $stable_id, 0 );
    } else {
        die(  "unrecognized stable_id format: $stable_id "
            . "- should match ([a-zA-Z]+)([0-9]+) or ([a-zA-Z]+) !!\n" );
    }

    # Letter inserted between the species prefix and the number.
    my %letter_for = ( 'gene'        => 'G',
                       'transcript'  => 'T',
                       'translation' => 'P',
                       'exon'        => 'E', );

    if ( !defined($type) || !exists( $letter_for{$type} ) ) {
        my $shown = defined($type) ? $type : '(undefined)';
        die(  "unrecognized type: $shown "
            . "- should be one of gene, transcript, translation, exon\n" );
    }

    my $next = $suffix + 1;

    my $new_stable_id =
      sprintf( "%s%s%011d", $prefix, $letter_for{$type}, $next );
    my $old = sprintf( "%s%011d", $prefix, $next );

    return [ $old, $new_stable_id ];
} ## end sub increment_stable_id
141 
142 #-------------------------------------------------------------------------------
143 
# get_max_stable_id_from_gene_archive( $dbi, $type )
#
# Looks up the highest "ENS..." stable ID stored in one column of the
# gene_archive table.
#
#   $dbi  - connected DBI database handle.
#   $type - name of the gene_archive column to inspect, e.g.
#           "gene_stable_id" or "transcript_stable_id".
#
# Returns the highest value found, or undef (after printing a note to STDERR)
# when the column holds no matching entry.
#
# Dies if $type is not a plain SQL identifier; the name is interpolated into
# the statement, so anything else is refused.
sub get_max_stable_id_from_gene_archive {
    my ( $dbi, $type ) = @_;

    if ( !defined($type) || $type !~ m/\A[A-Za-z_][A-Za-z0-9_]*\z/ ) {
        my $shown = defined($type) ? $type : '(undefined)';
        die("invalid gene_archive column name: $shown\n");
    }

    # Try to get from relevant archive.
    my $sth = $dbi->prepare("SELECT MAX($type) FROM gene_archive WHERE $type LIKE 'ENS%'");
    $sth->execute();

    # MAX() yields a single row; its value is NULL (undef) when nothing
    # matched.
    my ($rs) = $sth->fetchrow_array();
    $sth->finish();

    if ( !defined($rs) || length($rs) == 0 ) {
        print( STDERR "no entry for $type found in gene_archive table "
              . "- returning undef\n" );
        # Explicit undef (rather than a bare return) so that the caller
        # always receives exactly one value, as before.
        return undef;
    }

    return $rs;
}
164 
165 #-------------------------------------------------------------------------------
166 
# get_highest_stable_id( $dbi, $type )
#
# Determines the highest stable ID currently in use for $type and returns it
# with the type letter (G/T/P/E) removed from the end of its prefix, i.e. in
# the form expected by increment_stable_id().
#
#   $dbi  - connected DBI database handle.
#   $type - 'gene', 'transcript', 'translation' or 'exon'; also used as the
#           name of the table whose stable_id column is queried.
#
# For gene, transcript and translation the maximum of the live table and of
# the matching gene_archive column is used.  gene_archive holds no exon IDs,
# so for exons only the exon table is consulted; if that has no stable IDs,
# the species prefix is taken from the highest archived gene stable ID and
# numbering starts at zero.
#
# All diagnostics are written to STDERR, keeping STDOUT free for the SQL the
# script generates.
#
# Dies if no stable ID can be found anywhere or if the ID found cannot be
# split into a letter prefix and a number.
sub get_highest_stable_id {
    my ( $dbi, $type ) = @_;

    # Get highest stable ID from the relevant table.
    my $sth = $dbi->prepare("SELECT MAX(stable_id) FROM $type WHERE stable_id LIKE 'ENS%'");
    $sth->execute();

    my $highest_from_current;
    if ( my @row = $sth->fetchrow_array() ) {
        $highest_from_current = $row[0];
    } else {
        die("Can't get max $type stable ID from $type\n");
    }
    $sth->finish();

    # MAX() is NULL (undef) when the table has no matching stable IDs.
    my $have_current =
      defined($highest_from_current) && length($highest_from_current) > 0;

    if ( !$have_current ) {
        print( STDERR
               " Warning ! length of stable_id for $type is zero \n" );
    }

    if ( $type eq "exon" ) {
        # Archive doesn't store information about exon_stable_ids so try
        # without archive first.

        if ( !$have_current ) {
            print( STDERR "\n"
                  . "WARNING:\n"
                  . "No stable_id for exon found \n"
                  . "I got no prefix to generate new stable_ids for type $type!!! "
                  . "- I'll try to use gene_archive now\n" );

            my $max =
              get_max_stable_id_from_gene_archive( $dbi, "gene_stable_id" );

            if ( !defined($max) || length($max) == 0 ) {
                die(  "ERROR: "
                    . "No entries in table exon and "
                    . "gene_archive tables found\n"
                    . "Don't know which species prefix to use for species.\n" );
            }

            my ($prefix) = $max =~ /([a-zA-Z]+)[0-9]+/;
            if ( !defined($prefix) ) {
                die(  "ERROR: "
                    . "Can't parse stable ID '$max' found in gene_archive\n" );
            }

            # Drop the gene type letter to obtain the species prefix and
            # start numbering at zero.  The result carries no type letter,
            # so there is nothing further to strip.
            $prefix =~ s/G$//;

            return sprintf( "%s%011d", $prefix, 0 );
        } ## end if ( !$have_current )

        # remove the 'E' from exon stable id prefix
        my ( $prefix, $suffix ) =
          $highest_from_current =~ /([a-zA-Z]+)([0-9]+)/;
        if ( !defined($prefix) ) {
            die(  "ERROR: "
                . "Can't parse stable ID '$highest_from_current' "
                . "found in table $type\n" );
        }
        $prefix =~ s/E$//;

        return $prefix . $suffix;
    } ## end if ( $type eq "exon" )

    # and from relevant archive

    my $highest_from_archive =
      get_max_stable_id_from_gene_archive( $dbi, $type . "_stable_id" );

    # Compare as strings, treating a missing value as the empty string so
    # that it always loses against a real ID.
    my $current = $have_current ? $highest_from_current : '';
    my $archive = defined($highest_from_archive) ? $highest_from_archive : '';

    my $max = ( $current ge $archive ) ? $current : $archive;

    if ( length($max) == 0 ) {
        die(  "ERROR: "
            . "No stable_id in table gene_archive "
            . "or found in $type\_stable_id - tables\n" );
    }

    # Assuming that this is a correctly formatted stable id -> remove the
    # G/T/P for gene etc. (Exon dealt with above.)

    my ( $prefix, $suffix ) = $max =~ /([a-zA-Z]+)([0-9]+)/;
    if ( !defined($prefix) ) {
        die( "ERROR: " . "Can't parse stable ID '$max' found for $type\n" );
    }

    if ( $type eq 'gene' ) {
        $prefix =~ s/G$//;
    } elsif ( $type eq 'transcript' ) {
        $prefix =~ s/T$//;
    } elsif ( $type eq 'translation' ) {
        $prefix =~ s/P$//;
    }

    return $prefix . $suffix;
} ## end sub get_highest_stable_id
254 
255 #-------------------------------------------------------------------------------
256 
# usage()
#
# Prints the command-line help text to STDOUT.  Takes no arguments and does
# not exit; the caller decides what happens afterwards.
sub usage {
    # Single-quoted heredoc: the text is printed verbatim, nothing in it is
    # interpolated.
    print <<'USAGE_END';

  USAGE:

  generate_stable_ids.pl -dbuser|user {user}
                         -dbpass|pass {password}
                         -dbhost|host {host}
                         -dbport|port {port}
                         -dbname {database}
                         -types {gene,exon,transcript,translation}
                         -start {first stable ID}

  Argument to -types is a comma-separated list of types of stable IDs to
  be produced.

  If the -types argument is omitted, stable IDs are generated for all
  types (gene,transcript,translation,exon).

  Assigns stable IDs to objects that currently have none.  The starting
  stable ID is found by incrementing the highest current stable ID for
  that type *or* by using -start argument.  If no -start option is used
  the script tries to find the latest given stable_id for each object
  by looking up the stable_id column of the <OBJ> table in the database
  and the gene_archive table (only for gene, translation and transcript,
  not for exon stable IDs!)

  Please note that this script only works for "ENS..."-type stable ids.
  It does not work for LRG ids or other types of stable ids.

  Note:

    The -start option requires you to submit an initial stable ID
    without any gene, transcript, translation or exon specifier, as in

      ENSMUS000001 (not ENSMUSG0001, otherwise you end up with stable
      IDs like ENSMUSGG001, ENSMUSGT0001.)

    Again, the parameter to -start should be the stable ID you wish
    to start from without the gene, transcript, translation or exon
    specifier.

  Examples:

    - to generate only exon stable IDs starting with 223 for mouse, use

      -start ENSMUS222 -types exon

    - to generate exon and gene stable IDs, starting with 223 for mouse,
      use

      -start ENSMUS222 -types exon,gene

    - to generate all types of stable IDs for human, which all start
      with ID 666 use

      -start ENS665

    - to generate a whole new set of stable_ids (exon, transcript,
      translation, gene) starting with 1 for an organism with prefix
      ENSNEW you can use one of the following options :

      -start ENSNEW          <or>
      -start ENSNEW0         <or>
      -start ENSNEW00000000

  The script produces SQL which can be run against the target database.

USAGE_END
} ## end sub usage
transcript
public transcript()
usage
public usage()
increment_stable_id
public increment_stable_id()
exon
public exon()
run
public run()
get_max_stable_id_from_gene_archive
public get_max_stable_id_from_gene_archive()
get_highest_stable_id
public get_highest_stable_id()