ensembl-hive  2.7.0
repeat-libraries.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 #
18 # Repeat classification script
19 # based on js5's lite database repeat-libraries script
20 #
21 # script repeat-libraries.pl <UNUSED>
22 #
23 # This script is used to run on (old) v19 databases to get the
24 # repeat class from the repeat name before categorising them into types.
25 # It is not used for any other purpose anymore. Repeat classification on
26 # newer v32 databases is done with repeat-types.pl
27 #
28 
29 use strict;
30 use warnings;
31 
32 
33 use DBI;
34 use Getopt::Long;
35 
# Command-line options. -host, -dbpattern and -repeatfile are mandatory;
# -user, -pass and -port are passed through to DBI when given.
my ( $host, $user, $pass, $port, $dbpattern, $repeatfile, $help );

# GetOptions returns false when it meets an unknown or malformed option;
# treat that as a usage error instead of silently carrying on.
GetOptions(
    'host=s'       => \$host,
    'user=s'       => \$user,
    'pass=s'       => \$pass,
    'port=i'       => \$port,
    'repeatfile=s' => \$repeatfile,
    'dbpattern=s'  => \$dbpattern,
    'help'         => \$help,
) or usage(1);

usage(0) if $help;

# Report the first missing mandatory argument, then show the help text.
# The argument to usage() requests a non-zero exit status for error cases.
foreach my $required (
    [ host       => $host ],
    [ dbpattern  => $dbpattern ],
    [ repeatfile => $repeatfile ],
  )
{
    my ( $option, $value ) = @{$required};
    next if defined $value && length $value;
    print STDERR "-$option argument is required\n";
    usage(1);
}

my $dsn = "DBI:mysql:host=$host";
$dsn .= ";port=$port" if $port;

# DBI->connect returns undef on failure; stop here with the driver's message
# rather than failing later with a method call on an undefined handle.
my $dbh = DBI->connect( $dsn, $user, $pass )
    or die "Could not connect to $dsn: $DBI::errstr\n";

my $db_rows = $dbh->selectall_arrayref("show databases")
    or die "Could not list databases on $host: " . $dbh->errstr . "\n";
my @dbnames = map { $_->[0] } @{$db_rows};

# -dbpattern is a regular expression by design (see usage); compile it once.
my $db_regex = qr/$dbpattern/;
my @dbs = grep { $_ =~ $db_regex } @dbnames;

print STDERR "No databases on $host match '$dbpattern'\n" unless @dbs;
76 
# Parse the repeat file once, up front: its content is the same for every
# database, and an unreadable file is reported before any database is touched.
# Each usable line is "<repeat name>\t<repeat class>"; blank lines and lines
# without a class are skipped so they cannot write a NULL repeat_class.
my @repeat_classes;
{
    open my $rfile, '<', $repeatfile
        or die("Could not open repeat file $repeatfile: $!");
    while ( my $line = <$rfile> ) {
        $line =~ s/\r?\n\z//;    # tolerate both Unix and DOS line endings
        my ( $hid, $type ) = split( /\t/, $line, 2 );
        next unless defined $hid  && length $hid;
        next unless defined $type && length $type;
        push @repeat_classes, [ $hid, $type ];
    }
    close $rfile;
}

# Fallback rules for consensi whose repeat_class is still empty after the
# repeat file has been applied: [ repeat_class, LIKE pattern on repeat_name ].
# Rules only touch rows with an empty class, so the first matching rule wins
# and the order below is significant.
my @name_rules = (
    [ 'Simple_repeat',  '%)n' ],
    [ 'low_complexity', '%-rich' ],
    [ 'low_complexity', 'poly%' ],

    [ 'LTR/ERVL',      '%ERVL%' ],
    [ 'LTR/ERVL',      '%ERV16%' ],
    [ 'SINE/Alu',      'Alu%' ],
    [ 'SINE/Alu',      '%F_AM%' ],
    [ 'LINE/L1',       'L1%' ],
    [ 'DNA/MER2_type', 'Tigger%' ],
    [ 'DNA/MER1_type', 'Charlie%' ],
    [ 'DNA/Tc2',       'HsTC%' ],

    [ 'DNA/MER2_type',       'MER46%' ],
    [ 'DNA/MER2_type',       'MER7%' ],
    [ 'DNA/MER1_type',       'MER91' ],
    [ 'DNA/MER1_type',       'MER58' ],
    [ 'DNA/MER1_type',       'MER63' ],
    [ 'Satellite/telomeric', 'SUBTEL_%' ],

    # Disabled in the original script (they matched repeat_class 'Unknown'):
    #   'LTR/ERVL'   for repeat_name like 'MER70%'
    #   'DNA/AcHobo' for repeat_name like 'ORSL'
);

# repeat_class LIKE pattern => repeat_type, applied in this fixed order so
# that runs are reproducible.
my @type_rules = (
    [ 'Low_Comp%' => 'Low complexity regions' ],
    [ 'LINE%'     => 'Type I Transposons/LINE' ],
    [ 'SINE%'     => 'Type I Transposons/SINE' ],
    [ 'DNA%'      => 'Type II Transposons' ],
    [ 'LTR%'      => 'LTRs' ],
    [ 'Other%'    => 'Other repeats' ],
    [ 'Satelli%'  => 'Satellite repeats' ],
    [ 'Simple%'   => 'Simple repeats' ],
    [ 'Tandem%'   => 'Tandem repeats' ],
    [ 'TRF%'      => 'Tandem repeats' ],
    [ 'dust%'     => 'Dust' ],
    [ 'Unknown%'  => 'Unknown' ],
    [ '%RNA'      => 'RNA repeats' ],
);

foreach my $db (@dbs) {
    print STDERR "$db\n";

    # If the database cannot be selected the connection stays on the
    # previous one, so every update below would hit the wrong schema.
    unless ( $dbh->do( "use " . $dbh->quote_identifier($db) ) ) {
        print STDERR " Could not select database $db, skipping\n";
        next;
    }

    print STDERR " Clearing repeat_class\n";

    $dbh->do("update repeat_consensus set repeat_class = ''");

    print STDERR " Reading specific repeat classes from input file\n";

    # A repeat may be stored under its full name, under the name truncated
    # to 15 characters, or with an "-int" suffix; update all three spellings.
    my $set_class = $dbh->prepare(
        "update repeat_consensus set repeat_class = ? where repeat_name in (?,?,?)"
    ) or die "Could not prepare repeat_class update: " . $dbh->errstr . "\n";

    my $C = 0;
    foreach my $entry (@repeat_classes) {
        my ( $hid, $type ) = @{$entry};
        $set_class->execute( $type, $hid, substr( $hid, 0, 15 ), "$hid-int" );
        $C++;
        print STDERR "$C\n" unless $C % 100;    # progress every 100 entries
    }

    print STDERR " Consensifying remaining repeat classes\n";

    foreach my $rule (@name_rules) {
        my ( $class, $name_pattern ) = @{$rule};
        $dbh->do(
            "update repeat_consensus set repeat_class = ? where repeat_class = '' and repeat_name like ?",
            undef, $class, $name_pattern
        );
    }

    # trf and dust results are classified by exact name, not by pattern.
    foreach my $program (qw(trf dust)) {
        $dbh->do(
            "update repeat_consensus set repeat_class = ? where repeat_class = '' and repeat_name = ?",
            undef, $program, $program
        );
    }

    print STDERR " Setting repeat types\n";

    foreach my $rule (@type_rules) {
        my ( $class_pattern, $repeat_type ) = @{$rule};
        $dbh->do(
            "update repeat_consensus set repeat_type = ? where repeat_class like ?",
            undef, $repeat_type, $class_pattern
        );
    }

    # Type all remaining repeats as unknown. NULL has to be tested with
    # "is null": in SQL "repeat_type = null" is never true.
    $dbh->do(
        "update repeat_consensus set repeat_type = 'Unknown' where repeat_type = '' or repeat_type is null"
    );
}

print STDERR "All done.\n";

$dbh->disconnect;
162 
163 
# Print the help text to STDERR and terminate the script.
#
# Arguments:
#   $exit_code - optional process exit status. Defaults to 0, so a plain
#                usage() call behaves as before; callers reporting an
#                argument error can pass a non-zero value.
#
# Does not return.
sub usage {
    my ($exit_code) = @_;

    # Non-interpolating heredoc: the example pattern ends in "$'", which an
    # interpolating heredoc would expand as Perl's post-match variable and so
    # print a corrupted example (plus an uninitialized-value warning).
    print STDERR <<'EOF';

This program classifies the repeats stored in a core database into some
somewhat sensible categories. It does this through a combination of a
repeat.txt file extracted from RepeatMasker repeat libraries and through
some simple pattern matching of the repeat names.

usage: perl repeat-libraries.pl [-user <user>] [-port <port>] [-pass <pass>]
 -host <host> -dbpattern <regexp> -repeatfile <file>

example: perl repeat-libraries.pl -user ensadmin -pass secret -host ecs1g \
 -dbpattern '^homo_sapiens_(core|vega)_20_34c$' -repeatfile repeats.txt

EOF

    exit( defined $exit_code ? $exit_code : 0 );
}
usage
public usage()
map
public map()