ensembl-hive  2.8.1
vega_repeat_libraries.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 vega_repeat_libraries.pl - set repeat_consensus.repeat_class
21 
22 =head1 SYNOPSIS
23 
24 vega_repeat_libraries.pl [options]
25 
26 General options:
27  --conffile, --conf=FILE read parameters from FILE
28  (default: conf/Conversion.ini)
29 
30  --dbname, --db_name=NAME use database NAME
31  --host, --dbhost, --db_host=HOST use database host HOST
32  --port, --dbport, --db_port=PORT use database port PORT
33  --user, --dbuser, --db_user=USER use database username USER
34  --pass, --dbpass, --db_pass=PASS use database password PASS
35  --logfile, --log=FILE log to FILE (default: *STDOUT)
36  --logpath=PATH write logfile to PATH (default: .)
37  --logappend, --log_append append to logfile (default: truncate)
38  -v, --verbose verbose logging (default: false)
39  -i, --interactive=0|1 run script interactively (default: true)
40  -n, --dry_run, --dry=0|1 don't write results to database
41  -h, --help, -? print help (this message)
42 
43  --prune undo, i.e. delete from the database changes caused by running the script
44 
45 
46 Specific options:
47 
48  --repeatfile=FILE read repeat class definitions from FILE
49 
50 =head1 DESCRIPTION
51 
52 This program classifies the repeats stored in a core database into
53 somewhat sensible categories. It does this through a combination of a
54 repeat.txt file extracted from RepeatMasker repeat libraries and through some
55 simple pattern matching of the repeat names.
56 
57 
58 =head1 AUTHOR
59 
60 Steve Trevanion <st3@sanger.ac.uk>
61 Patrick Meidl <pm2@sanger.ac.uk>
62 
63 Based on code by James Smith <js5@sanger.ac.uk>
64 
65 =head1 CONTACT
66 
67 Post questions to the EnsEMBL development list http://lists.ensembl.org/mailman/listinfo/dev
68 
69 =cut
70 
71 use strict;
72 use warnings;
73 no warnings 'uninitialized';
74 
75 use FindBin qw($Bin);
76 use vars qw($SERVERROOT);
77 
78 BEGIN { # runs at compile time, so the 'use' lines further down can find the modules added to @INC here
79  $SERVERROOT = "$Bin/../../.."; # three directory levels above the directory holding this script
80  unshift(@INC, "$SERVERROOT/ensembl-otter/modules");
81  unshift(@INC, "$SERVERROOT/ensembl/modules");
82  unshift(@INC, "$SERVERROOT/bioperl-live");
83 }
84 
85 use Getopt::Long;
86 use Pod::Usage;
87 use Bio::EnsEMBL::Utils::ConversionSupport;
88 
89 $| = 1; # unbuffered output on the selected handle
90 
91 my $support = Bio::EnsEMBL::Utils::ConversionSupport->new($SERVERROOT); # helper object used for options, logging and DB access
92 
93 # parse the common options, then this script's own ones (--repeatfile, --prune)
94 $support->parse_common_options(@_);
95 $support->parse_extra_options('repeatfile=s', 'prune');
96 $support->allowed_params($support->get_common_params, 'repeatfile', 'prune');
97 
98 if ($support->param('help') or $support->error) {
99  warn $support->error if $support->error;
100  pod2usage(1);
101 }
102 
103 # ask user to confirm parameters to proceed
104 $support->confirm_params;
105 
106 # get log filehandle and print heading and parameters to logfile
107 $support->init_log;
108 
109 $support->check_required_params('repeatfile') unless $support->param('prune'); # don't need the repeat file for pruning
110 
111 # connect to database and get the raw DBI handle used for all SQL below
112 my $dba = $support->get_database('ensembl');
113 my $dbh = $dba->dbc->db_handle;
114 
115 # unless we are pruning (undo), we should make a backup copy of the repeat_consensus table
116 if($support->param('prune')){
117  # prune (undo mode)
118  # backup table must exist for this to work
119 
120  if(check_for_backup_table()){
121  # backup table present
122  if($support->user_proceed("Replace the current table 'repeat_consensus' with the backup table 'repeat_consensus_backup'?")){
123  if($dbh->do("drop table repeat_consensus")){ # NOTE(review): the live table is dropped before the copy-back is known to succeed
124  if($dbh->do("create table repeat_consensus select * from repeat_consensus_backup")){
125  $support->log("prune (undo) was successful\n");
126  $support->log_stamped("Done.\n");
127 
128  # finish logfile
129  $support->finish_log;
130  exit(0); # prune mode ends here; none of the classification steps below run
131  }
132  else {
133  $support->log_error("prune failed\n");
134  }
135  }
136  else {
137  $support->log_error("prune failed\n");
138  }
139  }
140  else{
141 
142  #user is aborting
143  print "aborting...\n";
144  $support->log_error("aborting...\n");
145  }
146  }
147  else{
148  print "Cannot do prune, as no backup table\n";
149  $support->log_error("Cannot do prune, as no backup table\n");
150  }
151 }
152 else{
153 
154  # normal run
155  # check to see if the backup table 'repeat_consensus_backup' already exists
156  if(check_for_backup_table()){
157  #table already exists: ask user if OK to overwrite it
158  if ($support->user_proceed("The backup table 'repeat_consensus_backup' already exists, OK to delete?")) {
159  if($dbh->do("drop table repeat_consensus_backup")){ # bare identifier: a single-quoted name is a string literal in MySQL, not a table
160  $support->log("deleted previous backup table\n");
161  make_backup_table();
162  }
163  else{
164  $support->log_error("tried but failed to delete previous backup table\n");
165  }
166  }
167  else{
168  # user won't allow removing the backup table
169  print "Aborting ...\n";
170  $support->log_error("User won't allow removal of backup table ... aborting program\n");
171  }
172  }else{
173  # table doesn't exist, therefore we can create it
174  make_backup_table();
175  }
176 }
177 
178 
179 # mouse fixes: both updates select on repeat_class = 'Tandem_repeat', so they have to run before repeat_class is cleared below
180 if ($support->species eq 'Mus_musculus') {
181  $support->log("Making Vega mouse specific changes...\n");
182  $support->log("Copying repeat_name to repeat_consensus...\n", 1);
183  $dbh->do("update repeat_consensus set repeat_consensus = repeat_name where repeat_class = 'Tandem_repeat'") unless ($support->param('dry_run')); # keep the old name in the repeat_consensus column
184  $support->log("Setting repeat_name to 'trf' where appropriate\n", 1);
185  $dbh->do("update repeat_consensus set repeat_name = 'trf' where repeat_class = 'Tandem_repeat'") unless ($support->param('dry_run')); # then rename those rows to 'trf'
186  $support->log("Done.\n");
187 }
188 
189 # clear repeat_class on every row (skipped on a dry run); the pattern-based updates further down only touch rows where it is still ''
190 $support->log("Clearing repeat_class...\n");
191 $dbh->do("update repeat_consensus set repeat_class = ''") unless ($support->param('dry_run'));
192 $support->log("Done.\n");
193 
194 # read repeat classes from file: each line is split on the first tab into a repeat name and its class
195 $support->log_stamped("Reading repeat classes from input file...\n");
196 my $fh = $support->filehandle('<', $support->param('repeatfile'));
197 my $C = 0; # number of input lines processed so far
198 while (<$fh>) {
199  chomp;
200  my ($hid, $type) = split( /\t/, $_, 2); # limit 2: any further tabs stay inside $type
201  $dbh->do("update repeat_consensus set repeat_class = ? where repeat_name in (?,?,?)", {} , $type, $hid, substr($hid,0,15), "$hid-int" ) unless ($support->param('dry_run')); # matches the full name, its first 15 characters, or the name with "-int" appended
202  $C++;
203  $support->log("$C\n", 1) unless $C % 100; # progress line every 100 input lines
204 }
205 close $fh;
206 $support->log_stamped("Done.\n");
207 
208 # Consensifying repeat classes: rows still unclassified (repeat_class = '') get a class chosen by a pattern on repeat_name
209 $support->log_stamped("Consensifying remaining repeat classes...\n");
210 unless ($support->param('dry_run')) {
211  $dbh->do("update repeat_consensus set repeat_class = 'Simple_repeat' where repeat_class= '' and repeat_name like '%)n'" );
212  $dbh->do("update repeat_consensus set repeat_class = 'low_complexity' where repeat_class= '' and repeat_name like '%-rich'" );
213  $dbh->do("update repeat_consensus set repeat_class = 'low_complexity' where repeat_class= '' and repeat_name like 'poly%'" );
214  $dbh->do("update repeat_consensus set repeat_class = 'LTR/ERVL' where repeat_class= '' and repeat_name like '%ERVL%' " );
215  $dbh->do("update repeat_consensus set repeat_class = 'LTR/ERVL' where repeat_class= '' and repeat_name like '%ERV16%' " );
216  $dbh->do("update repeat_consensus set repeat_class = 'SINE/Alu' where repeat_class= '' and repeat_name like 'Alu%' " );
217  $dbh->do("update repeat_consensus set repeat_class = 'SINE/Alu' where repeat_class= '' and repeat_name like '%F_AM%' " );
218  $dbh->do("update repeat_consensus set repeat_class = 'LINE/L1' where repeat_class= '' and repeat_name like 'L1%' " );
219  $dbh->do("update repeat_consensus set repeat_class = 'DNA/MER2_type' where repeat_class= '' and repeat_name like 'Tigger%' " );
220  $dbh->do("update repeat_consensus set repeat_class = 'DNA/MER1_type' where repeat_class= '' and repeat_name like 'Charlie%' " );
221  $dbh->do("update repeat_consensus set repeat_class = 'DNA/Tc2' where repeat_class= '' and repeat_name like 'HsTC%' " );
222  $dbh->do("update repeat_consensus set repeat_class = 'DNA/MER2_type' where repeat_class= '' and repeat_name like 'MER46%' " );
223  $dbh->do("update repeat_consensus set repeat_class = 'DNA/MER2_type' where repeat_class= '' and repeat_name like 'MER7%' " );
224  $dbh->do("update repeat_consensus set repeat_class = 'DNA/MER1_type' where repeat_class= '' and repeat_name like 'MER91' " ); # no wildcard in this and the next two patterns: only that exact name matches
225  $dbh->do("update repeat_consensus set repeat_class = 'DNA/MER1_type' where repeat_class= '' and repeat_name like 'MER58' " );
226  $dbh->do("update repeat_consensus set repeat_class = 'DNA/MER1_type' where repeat_class= '' and repeat_name like 'MER63' " );
227  $dbh->do("update repeat_consensus set repeat_class = 'Satellite/telomeric' where repeat_class= '' and repeat_name like 'SUBTEL_%' " ); # NOTE(review): '_' is itself a one-character LIKE wildcard, not a literal underscore
228  $dbh->do("update repeat_consensus set repeat_class = 'trf' where repeat_class = '' and repeat_name = 'trf' " );
229  $dbh->do("update repeat_consensus set repeat_class = 'dust' where repeat_class = '' and repeat_name = 'dust'" );
230  $dbh->do("update repeat_consensus set repeat_class = 'novel_transposon' where repeat_class = '' and repeat_name = 'novel_transposon'");
231 }
232 $support->log_stamped("Done.\n");
233 
234 # Setting repeat types: derive repeat_type from repeat_class via the LIKE patterns below
235 $support->log_stamped("Setting repeat types...\n");
236 my %mappings = ( # repeat_class LIKE pattern => repeat_type label
237  'Low_Comp%' => 'Low complexity regions',
238  'LINE%' => 'Type I Transposons/LINE',
239  'SINE%' => 'Type I Transposons/SINE',
240  'DNA%' => 'Type II Transposons',
241  'LTR%' => 'LTRs',
242  'Other%' => 'Other repeats',
243  'Satelli%' => 'Satellite repeats',
244  'Simple%' => 'Simple repeats',
245  'Other%' => 'Other repeats',
246  'Tandem%' => 'Tandem repeats',
247  'TRF%' => 'Tandem repeats',
248  'dust%' => 'Dust',
249  'Unknown%' => 'Unknown',
250  '%RNA' => 'RNA repeats',
251  'novel_transposon' => 'Novel Transposon',
252 );
253 unless ($support->param('dry_run')) {
254  foreach (sort keys %mappings) { # sorted: patterns can overlap (e.g. 'SINE%' and '%RNA'), so a fixed order keeps the result reproducible
255  $dbh->do(q(update repeat_consensus set repeat_type = ? where repeat_class like ?), {}, $mappings{$_}, $_);
256  }
257 
258  # type all remaining repeats as unknown
259  $dbh->do(qq(update repeat_consensus set repeat_type = 'Unknown' where repeat_type = ''));
260  $dbh->do(qq(update repeat_consensus set repeat_type = 'Unknown' where repeat_type is null)); # "= NULL" is never true in SQL, so NULL rows were left untyped
261 }
262 $support->log_stamped("Done.\n");
263 
264 # finish logfile; this is the last statement of the normal (non-prune) run
265 $support->finish_log;
266 
267 
268 sub make_backup_table { # copies repeat_consensus into a new table repeat_consensus_backup, using the file-level $dbh and $support
269  if($dbh->do("create table repeat_consensus_backup select * from repeat_consensus")){
270  $support->log("backup table 'repeat_consensus_backup' was created successfully\n");
271  }
272  else{
273  $support->log_error("failed to create backup table 'repeat_consensus_backup'\n");
274  }
275 }
276 
277 sub check_for_backup_table { # returns 1 if the file-level $dbh lists a table repeat_consensus_backup, otherwise 0
278  # check to see if the backup table 'repeat_consensus_backup' already exists
279  my @tables = $dbh->tables();
280  my $found=0;
281 
282  foreach my $table(@tables){
283  # names may come back back-quoted and/or schema-qualified, e.g. `db`.`repeat_consensus_backup`
284 
285  if($table =~ /(?:\A|\.)`?repeat_consensus_backup`?\z/){
286  $found=1;
287  last;
288  }
289  }
290  return $found;
291 }
check_for_backup_table
public check_for_backup_table()
run
public run()
make_backup_table
public make_backup_table()