2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
18 # Repeat classification script
19 # based on js5's lite database repeat-libraries script
21 # script repeat-libraries.pl <UNUSED>
23 # This script is run on (old) v19 databases to get the
24 # repeat class from the repeat name before categorising them into types.
25 # It is not used for any other purpose anymore. Repeat classification on
26 # newer v32 databases is done with repeat-types.pl
36 my ( $host, $user, $pass, $port, $expression, $dbpattern, $repeatfile, $help );
38 GetOptions(
"host=s", \$host,
42 "repeatfile=s", \$repeatfile,
43 "dbpattern=s", \$dbpattern,
52 print STDERR
"-host argument is required\n";
57 print STDERR
"-dbpattern argument is required\n";
62 print STDERR
"-repeatfile argument is required\n";
66 my $dsn =
"DBI:mysql:host=$host";
68 $dsn .=
";port=$port";
# Connect to the MySQL server (the DSN names a host, and optionally a port,
# but no default schema). DBI->connect returns undef on failure, so stop here
# with the driver's error text instead of failing later on an undefined handle.
my $dbh = DBI->connect( $dsn, $user, $pass )
    or die("Could not connect to $dsn: $DBI::errstr\n");

# Names of all databases visible to this user; selectcol_arrayref returns the
# first column of every row as a flat array reference.
my $all_dbnames = $dbh->selectcol_arrayref("show databases")
    or die( "Could not list databases: " . $dbh->errstr . "\n" );
my @dbnames = @{$all_dbnames};

# Keep only the databases whose name matches the -dbpattern regexp.
my @dbs = grep { /$dbpattern/ } @dbnames;
77 foreach my $db (@dbs) {
# Open the repeat file read-only. The explicit '<' mode keeps characters in
# the file name from being interpreted as an open mode or a pipe, and $!
# reports why the open failed.
open RFILE, '<', $repeatfile
    or die("Could not open repeat file $repeatfile: $!");
84 print STDERR
" Clearing repeat_class\n";
86 $dbh->do(
"update repeat_consensus set repeat_class = ''");
88 print STDERR
" Reading specific repeat classes from input file\n";
94 my($hid,$type) = split( /\t/, $_, 2);
95 $dbh->do(
"update repeat_consensus set repeat_class = ? where repeat_name in (?,?,?)", {} , $type, $hid, substr($hid,0,15),
"$hid-int" );
97 print STDERR
"$C\n" unless $C % 100;
102 print STDERR
" Consensifying remaining repeat classes\n";
# Fallback classification by repeat name. Every statement below only touches
# rows whose repeat_class is still '', so a row keeps the class assigned by the
# first statement that matches it and the order of the statements matters.
# In a LIKE pattern '%' matches any run of characters and '_' any single
# character, so 'SUBTEL_%' matches "SUBTEL" followed by at least one character.
# NOTE(review): if a literal underscore was intended there, it would need
# escaping - confirm. Patterns without wildcards, such as 'MER91', match the
# whole name only.
104 $dbh->do(
"update repeat_consensus set repeat_class = 'Simple_repeat' where repeat_class= '' and repeat_name like '%)n'" );
105 $dbh->do(
"update repeat_consensus set repeat_class = 'low_complexity' where repeat_class= '' and repeat_name like '%-rich'" );
106 $dbh->do(
"update repeat_consensus set repeat_class = 'low_complexity' where repeat_class= '' and repeat_name like 'poly%'" );
108 $dbh->do(
"update repeat_consensus set repeat_class = 'LTR/ERVL' where repeat_class= '' and repeat_name like '%ERVL%' " );
109 $dbh->do(
"update repeat_consensus set repeat_class = 'LTR/ERVL' where repeat_class= '' and repeat_name like '%ERV16%' " );
110 $dbh->do(
"update repeat_consensus set repeat_class = 'SINE/Alu' where repeat_class= '' and repeat_name like 'Alu%' " );
111 $dbh->do(
"update repeat_consensus set repeat_class = 'SINE/Alu' where repeat_class= '' and repeat_name like '%F_AM%' " );
112 $dbh->do(
"update repeat_consensus set repeat_class = 'LINE/L1' where repeat_class= '' and repeat_name like 'L1%' " );
113 $dbh->do(
"update repeat_consensus set repeat_class = 'DNA/MER2_type' where repeat_class= '' and repeat_name like 'Tigger%' " );
114 $dbh->do(
"update repeat_consensus set repeat_class = 'DNA/MER1_type' where repeat_class= '' and repeat_name like 'Charlie%' " );
115 $dbh->do(
"update repeat_consensus set repeat_class = 'DNA/Tc2' where repeat_class= '' and repeat_name like 'HsTC%' " );
118 $dbh->do(
"update repeat_consensus set repeat_class = 'DNA/MER2_type' where repeat_class= '' and repeat_name like 'MER46%' " );
119 $dbh->do(
"update repeat_consensus set repeat_class = 'DNA/MER2_type' where repeat_class= '' and repeat_name like 'MER7%' " );
120 $dbh->do(
"update repeat_consensus set repeat_class = 'DNA/MER1_type' where repeat_class= '' and repeat_name like 'MER91' " );
121 $dbh->do(
"update repeat_consensus set repeat_class = 'DNA/MER1_type' where repeat_class= '' and repeat_name like 'MER58' " );
122 $dbh->do(
"update repeat_consensus set repeat_class = 'DNA/MER1_type' where repeat_class= '' and repeat_name like 'MER63' " );
123 $dbh->do(
"update repeat_consensus set repeat_class = 'Satellite/telomeric' where repeat_class= '' and repeat_name like 'SUBTEL_%' " );
125 $dbh->do(
"update repeat_consensus set repeat_class = 'trf' where repeat_class = '' and repeat_name = 'trf' " );
126 $dbh->do(
"update repeat_consensus set repeat_class = 'dust' where repeat_class = '' and repeat_name = 'dust'" );
129 # $dbh->do("update repeat_consensus set repeat_class = 'LTR/ERVL' where repeat_class= 'Unknown' and repeat_name like 'MER70%' " );
130 # $dbh->do("update repeat_consensus set repeat_class = 'DNA/AcHobo' where repeat_class= 'Unknown' and repeat_name like 'ORSL' " );
132 print STDERR
" Setting repeat types\n";
135 'Low_Comp%' =>
'Low complexity regions',
136 'LINE%' =>
'Type I Transposons/LINE',
137 'SINE%' =>
'Type I Transposons/SINE',
138 'DNA%' =>
'Type II Transposons',
140 'Other%' =>
'Other repeats',
141 'Satelli%' =>
'Satellite repeats',
142 'Simple%' =>
'Simple repeats',
143 'Other%' =>
'Other repeats',
144 'Tandem%' =>
'Tandem repeats',
145 'TRF%' =>
'Tandem repeats',
147 'Unknown%' =>
'Unknown',
148 '%RNA' =>
'RNA repeats',
150 foreach (keys %mappings) {
# Set the repeat type for every class matching the current LIKE pattern ($_).
# The type and the pattern are bound as placeholders rather than interpolated
# into the SQL text, so quoting is handled by the driver.
$dbh->do(
    "update repeat_consensus set repeat_type = ? where repeat_class like ?",
    {}, $mappings{$_}, $_ );
# Type all remaining repeats as unknown. A NULL repeat_type has to be tested
# with IS NULL: "repeat_type = null" is never true in SQL, so an update
# written that way cannot match any row.
$dbh->do(
    "update repeat_consensus set repeat_type = 'Unknown' where repeat_type = '' or repeat_type is null"
);
159 print STDERR
"All done.\n";
167 This program classifies the repeats stored in a core database into some
168 somewhat sensible categories. It does
this through a combination of a
169 repeat.txt file extracted from RepeatMasker repeat libraries and through
170 some simple pattern matching of the repeat names.
172 usage: perl repeat-libraries.pl [-user <user>] [-port <port>] [-pass <pass>]
173 -host <host> -dbpattern <regexp> -repeatfile <file>
175 example: perl repeat-libraries.pl -user ensadmin -pass secret -host ecs1g \\
176 -dbpattern
'^homo_sapiens_(core|vega)_20_34c$' -repeatfile repeats.txt