ensembl-hive  2.8.1
test_assembly_mapping.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1
19 
20 test_assembly_mapping.pl [arguments]
21 
22 Produce statistics on the quality of feature mappings between two different assemblies.
23 Feature mappings need to be loaded first using script load_feature_mappings.pl
24 Sample config file: test_assembly_mapping.ini.example
25 
26 Required arguments:
27  --host=HOST test database host HOST
28  --port=PORT test database port PORT
29  --user=USER test database username USER
30  --pass=PASS test database passwort PASS
31  --dbname=NAME test database name NAME
32 
33 
34 Optional arguments:
35 
36  --conffile=filename read parameters from FILE
37  (default: conf/Conversion.ini)
38  --logfile, --log=FILE log to FILE (default: *STDOUT)
39  --logpath=PATH write logfile to PATH (default: .)
40  --logappend, --log_append append to logfile (default: truncate)
41 
42 =cut
43 
44 use strict;
45 use warnings;
46 no warnings 'uninitialized';
47 
48 use FindBin qw($Bin);
49 use Getopt::Long;
50 use Pod::Usage;
53 
54 
55 use vars qw($SERVERROOT);
56 
57 BEGIN {
58  $SERVERROOT = "$Bin/../";
59 }
60 
61 
62 $| = 1;
63 
64 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
65 
66 # parse options
67 $support->parse_common_options(@_);
68 
69 $support->allowed_params(qw(dbname host port user pass conffile logfile logpath logappend));
70 
71 if ($support->param('help') or $support->error) {
72  warn $support->error if $support->error;
73  pod2usage(1);
74 }
75 
76 # ask user to confirm parameters to proceed
77 $support->confirm_params;
78 
79 # get log filehandle and print heading and parameters to logfile
80 $support->init_log;
81 
82 $support->check_required_params(qw(dbname host port user));
83 
84 my ($dba, $dbh, $sql, $sth);
85 $dbh = $support->get_dbconnection();
86 
87 
88 my %mapping_quality = ( 1 => 'with gaps',
89  2 => 'without gaps');
90 
91 #get chromosomes
92 
93 my @chrs = @{$dbh->selectcol_arrayref("select distinct old_chr from mapping order by old_chr+0")};
94 
95 foreach my $chr (@chrs) {
96 
97  my @feature_types = @{$dbh->selectcol_arrayref("select distinct feature_type from mapping where old_chr = '". $chr . "' order by feature_type")};
98 
99  foreach my $feature_type (@feature_types) {
100  $support->log_stamped("processing $feature_type mappings for chromosome $chr\n\n",1);
101 
102 
103  #count all features in the old db
104  $sql = "select count(1) from mapping where feature_type = '" . $feature_type . "' and old_chr = '". $chr . "'" ;
105  my ($total_features_old) = $dbh->selectrow_array($sql);
106 
107  #count all features both in the old and new db
108  $sql = "select count(1) from mapping where feature_type = '" . $feature_type . "' and feature_found_in_new_db = 1 and old_chr = '". $chr . "'";
109  my ($total_features_old_new) = $dbh->selectrow_array($sql);
110 
111  my $perc_features = ($total_features_old_new/$total_features_old) * 100;
112  $perc_features = sprintf "%.2f", $perc_features;
113 
114  $support->log_stamped("$total_features_old_new $feature_type" . "s found in both old and new assembly dbs ($perc_features\% in old assembly db found in new db)\n\n", 1);
115 
116  #count features which have the same length in the new db
117  $sql = "select count(1) from mapping where feature_type = '" . $feature_type . "' and feature_found_in_new_db = 1 and old_length = new_length and old_chr = '". $chr . "'";
118 
119  my ($total_same_length) = $dbh->selectrow_array($sql);
120 
121  my $ok_mappings_1 = 0;
122  my $perc_mappings_1 = 0;
123 
124  if ($total_same_length > 0) {
125  #count mappings where start and end match feature location in the new db by mapping quality (1-contains gaps, 2 - no gaps
126 
127  $support->log_stamped("$total_same_length $feature_type" . "s where feature length in the old assembly db is the same as in the new assembly db\n\n", 2);
128 
129  $sql = "select count(1) as mapping_count,mapping_quality from mapping where feature_type = '" . $feature_type . "' and old_chr = '". $chr . "' and feature_found_in_new_db = 1 and old_length = new_length and new_start = mapping_start and new_end = mapping_end and (new_strand = mapping_strands or mapping_strands = 'both') and (mapping_chrs = new_chr or mapping_chrs like concat(new_chr,',%')) group by mapping_quality";
130 
131  my %mapping_counts = %{$dbh->selectall_hashref($sql,"mapping_quality")};
132 
133  my $ok_mappings = 0;
134 
135  foreach my $mapping_q (keys %mapping_counts) {
136  my $perc = ($mapping_counts{$mapping_q}{'mapping_count'}/$total_same_length) * 100;
137  $perc = sprintf "%.2f", $perc;
138 
139  $support->log_stamped("$mapping_counts{$mapping_q}{'mapping_count'} $feature_type mappings $mapping_quality{$mapping_q} where feature length in the old assembly db is the same as in the new assembly db and mapping start and end match feature location in the new db ($perc\%)\n\n", 3);
140 
141  $ok_mappings += $mapping_counts{$mapping_q}{'mapping_count'};
142 
143  }
144 
145  $ok_mappings_1 += $ok_mappings;
146  my $perc_mappings = ($ok_mappings/$total_same_length) * 100;
147  $perc_mappings = sprintf "%.2f", $perc_mappings;
148 
149  $support->log_stamped("total: $ok_mappings ok $feature_type mappings ($perc_mappings\%):\n\n", 3);
150 
151  } else {
152  $support->log_stamped("no $feature_type" . "s found for chromosome $chr where feature length in the old assembly db is the same as in the new assembly db\n\n", 2);
153  }
154 
155  $sql = "select count(1) from mapping where feature_type = '" . $feature_type . "' and old_chr = '". $chr . "' and feature_found_in_new_db = 1 and old_length != new_length";
156 
157  my ($total_diff_length) = $dbh->selectrow_array($sql);
158 
159  if ($total_diff_length > 0) {
160 
161  $support->log_stamped("$total_diff_length $feature_type" . "s found for chromosome $chr where feature length in the old assembly db is different than in the new assembly db\n\n", 2);
162 
163  #count features with different length in the new db where either mapping start or end match new feature start or end respectively, by mapping quality
164 
165  $sql = "select count(1) as mapping_count,mapping_quality from mapping where feature_type = '" . $feature_type . "' and old_chr = '". $chr . "' and feature_found_in_new_db = 1 and old_length != new_length and (new_start = mapping_start or new_end = mapping_end) and (new_strand = mapping_strands or mapping_strands = 'both') and (mapping_chrs = new_chr or mapping_chrs like concat(new_chr,',%')) group by mapping_quality";
166 
167  my %mapping_counts = %{$dbh->selectall_hashref($sql,"mapping_quality")};
168 
169  my $ok_mappings = 0;
170 
171  foreach my $mapping_q (keys %mapping_counts) {
172  my $perc = ($mapping_counts{$mapping_q}{'mapping_count'}/$total_diff_length) * 100;
173  $perc = sprintf "%.2f", $perc;
174 
175  $support->log_stamped("$mapping_counts{$mapping_q}{'mapping_count'} $feature_type mappings $mapping_quality{$mapping_q} found for chromosome where feature length in the old assembly db is different than in the new assembly db and either mapping start or end matches the gene location in the new db ($perc\%)\n\n", 3);
176 
177  $ok_mappings += $mapping_counts{$mapping_q}{'mapping_count'};
178 
179  }
180 
181  #count features with different length in the new db where either mapping start or end is within the difference between old and new length from the new feature start or end respectively, by mapping quality
182  $sql = "select count(1) as mapping_count,mapping_quality from mapping where feature_type = '" . $feature_type . "' and old_chr = '". $chr . "' and feature_found_in_new_db = 1 and old_length != new_length and (abs(new_start - mapping_start) <= abs(new_length - old_length) or abs(new_end - mapping_end) <= abs(new_length - old_length)) and new_start != mapping_start and new_end != mapping_end and (new_strand = mapping_strands or mapping_strands = 'both') and (mapping_chrs = new_chr or mapping_chrs like concat(new_chr,',%')) group by mapping_quality";
183 
184  %mapping_counts = %{$dbh->selectall_hashref($sql,"mapping_quality")};
185 
186 
187  foreach my $mapping_q (keys %mapping_counts) {
188  my $perc = ($mapping_counts{$mapping_q}{'mapping_count'}/$total_diff_length) * 100;
189  $perc = sprintf "%.2f", $perc;
190 
191  $support->log_stamped("$mapping_counts{$mapping_q}{'mapping_count'} $feature_type mappings $mapping_quality{$mapping_q} found for chromosome where feature length in the old assembly db is different than in the new assembly db and either mapping start or end is within the difference between old and new feature length from the new feature start or end, respectively ($perc\%)\n\n", 3);
192  $ok_mappings += $mapping_counts{$mapping_q}{'mapping_count'};
193 
194  }
195 
196  my $perc_mappings = ($ok_mappings/$total_diff_length) * 100;
197  $perc_mappings = sprintf "%.2f", $perc_mappings;
198  $support->log_stamped("total: $ok_mappings ok $feature_type mappings ($perc_mappings\%):\n\n", 3);
199 
200  $ok_mappings_1 += $ok_mappings;
201 
202  } else {
203  $support->log_stamped("no $feature_type" . "s found for chromosome $chr where feature length in the old assembly db is different than in the new assembly db\n\n", 2);
204  }
205 
206  $perc_mappings_1 = ($ok_mappings_1/$total_features_old_new) * 100;
207  $perc_mappings_1 = sprintf "%.2f", $perc_mappings_1;
208  $support->log_stamped("total: $ok_mappings_1 ok $feature_type mappings ($perc_mappings_1\%):\n\n", 1);
209 
210 
211  }
212 
213 }
214 
215 # finish logfile
216 $support->finish_log;
Bio::EnsEMBL::Utils::ConversionSupport
Definition: ConversionSupport.pm:39
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
BEGIN
public BEGIN()