2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
18 # Generate stable IDs for genes/transcripts/translations/exons that have none
19 # Start from current max stable ID + 1
29 my ( $host, $dbname, $user, $pass, @types, $start, $verbose );
33 'dbuser|user=s' => \$user,
34 'dbpass|pass=s' => \$pass,
35 'dbhost|host=s' => \$host,
36 'dbport|port=i' => \$port,
37 'dbname=s' => \$dbname,
40 # USE ENS000001 or ENS for human, ENSMUS00001 or ENSMUS for mouse etc.
41 # Don't add G/T/E/P for specific types !!!
43 'verbose!
' => \$verbose, )
47 || !defined($dbname) ) )
56 @types = split( /,/, join( ',
', @types ) );
59 DBI->connect( "DBI:mysql:host=$host:port=$port;database=$dbname",
60 $user, $pass, { 'RaiseError
' => 1 } )
61 || die "Can't connect to database\n
";
63 foreach my $type (@types) {
66 # Get starting stable ID, either specified or current max.
69 if ( defined($start) ) {
70 $new_stable_id = $start;
72 $new_stable_id = get_highest_stable_id( $dbi, $type );
76 print("Highest, pruned $type\_stable_id found : $new_stable_id \n
");
79 # Get timestamp so all new stable IDs have the same created/modified
81 $sth = $dbi->prepare("SELECT NOW()
");
84 if ( my @row = $sth->fetchrow_array() ) {
87 die("Can
't get timestamp\n");
91 # get a list of objects that don't currently have stable IDs assigned
92 # and assign new ones, incrementing & preserving formatting as we go
96 .
"WHERE stable_id IS NULL";
97 $sth = $dbi->prepare($sql);
100 while ( my @row = $sth->fetchrow_array() ) {
101 ( $new_stable_id, my $nis ) =
102 @{ increment_stable_id( $new_stable_id, $type ) };
103 print(
"UPDATE $type SET stable_id = \'$nis\', version = 1, created_date = \'$ts\', modified_date = \'$ts\'"
104 .
" WHERE ${type}_id = $row[0];\n" );
106 } ## end
foreach my $type (@types)
108 #-------------------------------------------------------------------------------
111 my ( $stable_id, $type ) = @_;
113 my ( $prefix, $suffix );
115 # Check stable_id format ...
116 if ( $stable_id =~ m/([a-zA-Z]+)([0-9]+)/ ) {
117 ( $prefix, $suffix ) = $stable_id =~ /([a-zA-Z]+)([0-9]+)/;
118 } elsif ( $stable_id =~ m/([a-zA-Z]+)/ ) {
119 $prefix = $stable_id;
121 die(
"unrecognized stable_id format: $stable_id "
122 .
"- should match ([a-zA-Z]+)([0-9]+) or ([a-zA-Z]+) !!\n" );
126 if ( $type eq
'gene' ) {
127 $new_sid = $prefix .
'G';
128 } elsif ( $type eq
'transcript' ) {
129 $new_sid = $prefix .
'T';
130 } elsif ( $type eq
'translation' ) {
131 $new_sid = $prefix .
'P';
132 } elsif ( $type eq
'exon' ) {
133 $new_sid = $prefix .
'E';
135 my $new_stable_id = sprintf(
"%s%011d", $new_sid, $suffix + 1 );
137 my $old = sprintf(
"%s%011d", $prefix, $suffix + 1 );
139 return [ $old, $new_stable_id ];
142 #-------------------------------------------------------------------------------
145 my ( $dbi, $type ) = @_;
147 # Try to get from relevant archive.
148 my $sth = $dbi->prepare(
"SELECT MAX($type) FROM gene_archive WHERE $type LIKE 'ENS%'");
152 if ( my @row = $sth->fetchrow_array ) {
156 if ( length($rs) <= 0 ) {
157 print( STDERR
"no entry for $type found in gene_archive table "
158 .
"- returning undef\n" );
165 #-------------------------------------------------------------------------------
168 my ( $dbi, $type ) = @_;
170 my ( $highest_from_current, $highest_from_archive );
172 # Get highest stable ID from the relevant table.
174 my $sth = $dbi->prepare(
"SELECT MAX(stable_id) FROM $type WHERE stable_id LIKE 'ENS%'");
177 if ( my @row = $sth->fetchrow_array() ) {
178 $highest_from_current = $row[0];
180 die(
"Can't get max $type stable ID from $type\n");
183 if ( length($highest_from_current) == 0 ) {
185 " Warning ! length of stable_id for $type is zero \n" );
188 if ( $type eq
"exon" ) {
189 # Archive doesn't store information about exon_stable_ids so try
190 # without archive first.
192 if ( length($highest_from_current) == 0 ) {
195 .
"No stable_id for exon found \n"
196 .
"I got no prefix to generate new stable_ids for type $type!!! "
197 .
"- I'll try to use gene_archive now\n" );
204 if ( length($max) > 0 ) {
205 ( $prefix, my $suffix ) = $max =~ /([a-zA-Z]+)([0-9]+)/;
209 .
"No entries in table exon and "
210 .
"gene_archive tables found\n"
211 .
"Don't know which species prefix to use for species.\n" );
213 $highest_from_current = sprintf(
"%s%011d", $prefix, 0 );
215 } ## end
if ( length($highest_from_current...))
217 # remove the 'E' from exon stable id prefix
218 my ( $prefix, $suffix ) = $highest_from_current =~ /([a-zA-Z]+)([0-9]+)/;
221 return $prefix . $suffix;
222 } ## end
if ( $type eq
"exon" )
224 # and from relevant archive
226 $highest_from_archive =
230 ( $highest_from_current ge $highest_from_archive )
231 ? $highest_from_current
232 : $highest_from_archive;
234 if ( length($max) == 0 ) {
236 .
"No stable_id in table gene_archive "
237 .
"or found in $type\_stable_id - tables\n" );
240 # Assuming that this is a correctly formatted stable id -> remove the
241 # G/T/P for gene etc. (Exon dealt with above.)
243 my ( $prefix, $suffix ) = $max =~ /([a-zA-Z]+)([0-9]+)/;
244 if ( $type eq
'gene' ) {
246 } elsif ( $type eq
'transcript' ) {
248 } elsif ( $type eq
'translation' ) {
252 return $prefix . $suffix;
255 #-------------------------------------------------------------------------------
262 generate_stable_ids.pl -dbuser|user {user}
263 -dbpass|pass {password}
268 -start {first stable ID}
270 Argument to -types is a comma-separated list of types of stable IDs to
273 If the -types argument is omitted, stable IDs are generated for all
276 Assigns stable IDs to objects that currently have none. The starting
277 stable ID is found by incrementing the highest current stable ID for
278 that type *or* by using the -start argument. If no -start option is used,
279 the script tries to find the latest given stable_id for each object
280 by looking up the <OBJ>_stable_id tables in the database and the
281 gene_archive table (only for gene, translation and transcript, not for
284 Please note that this script only works for "ENS..."-type stable ids.
285 It does not work for LRG ids or other types of stable ids.
289 The -start option requires you to submit an initial stable ID without
292 ENSMUS000001 (not ENSMUSG0001, then you end up with stable IDs like
293 ENSMUSGG001, ENSMUSGT0001.)
295 Again, the parameter to -start should be the stable ID you wish
301 - to generate only exon stable IDs starting with 223 for mouse, use
303 -start ENSMUS222 -types exon
305 - to generate exon and gene stable IDs, starting with 223 for mouse,
308 -start ENSMUS222 -types exon,gene
310 - to generate all types of stable IDs for human, which all start
316 translation, gene) starting with 1 for an organism with prefix
317 ENSNEW you can use one of the following options :
321 -start ENSNEW00000000
323 The script produces SQL which can be run against the target database.