ensembl-hive  2.7.0
populate_protein_feature.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 #The script populates the CORE db's protein_feature table with all the uniprot mappings found in GIFTS db's ensp_u_cigar table.
18 
19 
20 use strict;
21 use warnings;
22 use DBI qw( :sql_types );
23 use Getopt::Long;
25 use Data::Dumper;
26 
27 my ( $gfhost, $gfport, $gfuser, $gfpass, $gfdb );
28 my ( $pfhost, $pfport, $pfuser, $pfpass, $pfdb );
29 
30 my $create_index = 0;
31 my $dryrun = 0;
32 my $limit = 0;
33 
34 
35 GetOptions(
36  "gfhost=s" => \$gfhost,
37  "gfport=i" => \$gfport,
38  "gfuser=s" => \$gfuser,
39  "gfpass=s" => \$gfpass,
40  "gfdb=s" => \$gfdb,
41 
42  "pfhost=s" => \$pfhost,
43  "pfport=i" => \$pfport,
44  "pfuser=s" => \$pfuser,
45  "pfpass=s" => \$pfpass,
46  "pfdb=s" => \$pfdb,
47 
48  "limit=i" => \$limit,
49  "dryrun!" => \$dryrun,
50  "help", \&usage,
51 
52 );
53 
54 usage() if ( !defined $gfhost || !defined $gfport || !defined $gfuser || !defined $gfdb );
55 usage() if ( !defined $pfhost || !defined $pfport || !defined $pfuser || !defined $pfpass );
56 
57 my ( $giftsDB, $protfeatureDB ) = init_db();
58 die unless defined($giftsDB) and defined($protfeatureDB);
59 
60 print "giftsDB credentials:\n";
61 print Dumper($giftsDB);
62 
63 print "protfeatureDB credentials:\n";
64 print Dumper($protfeatureDB);
65 
66 my $registry = get_registry();
67 my $translation_adaptor = $registry->get_adaptor( 'human', 'core', 'translation' );
68 
69 # Main method call
71 
72 sub init_db {
73 
74 # This is where the gifts info will be read from
75  my $giftsDB = {
76  user => $gfuser,
77  pass => $gfpass,
78  host => $gfhost,
79  port => $gfport,
80  dbname => $gfdb
81 
82  };
83 
84 # This is where the gifts info will be written to
85  my $protfeatureDB = {
86  user => $pfuser,
87  pass => $pfpass,
88  host => $pfhost,
89  port => $pfport,
90  dbname => $pfdb
91  };
92 
93  return ( $giftsDB, $protfeatureDB );
94 }
95 
96 #get the analysis_id for gifts_import
97 sub get_analysis_id {
98  my ($logic_name) = @_;
99  my $analysis_id;
100 
101  my $dba = get_dba($protfeatureDB);
102  my $analysis_dba = $dba->get_AnalysisAdaptor();
103  my $analysis = $analysis_dba->fetch_by_logic_name($logic_name);
104 
105  $analysis_id = $analysis->dbID() if defined $analysis;
106  return $analysis_id;
107 }
108 
109 
111 
112  my $gifts_dbh = db_connect($giftsDB );
113  my $protfeature_dbh = db_connect($protfeatureDB);
114 
115  if($gifts_dbh){
116  print "Got connection to GiftsDB \n";
117  }
118 
119  if($protfeature_dbh){
120  print "Got connection to ProtfeatureDB \n";
121  }
122 
123  #container to hold the sql statements in batches (default: 1000)
124  my $insert_container = build_insert_sql($gifts_dbh, $protfeature_dbh);
125 
126  my $rows_inserted = 0;
127  my $rows_failed = 0;
128  my @failed_stmts = undef;
129 
130  unless($dryrun){
131 
132  foreach my $insert_stmt (@$insert_container) {
133  eval{
134  my $affected = $protfeature_dbh->do($insert_stmt);
135  $rows_inserted += $affected;
136  print "Affeced $affected Rows inserted $rows_inserted\n";
137  };
138  if($@){
139  print "$@\n";
140  $rows_failed++;
141  push(@failed_stmts, $@);
142  }
143 
144  }
145  print "FAILED STATEMENTS\n";
146  print Dumper(\@failed_stmts);
147  print "INSERT END\n";
148  }
149 
150  print "Number of rows inserted into protein_feature $protfeatureDB->{'dbname'}.protein_feature ", $rows_inserted, "\n";
151  print "Number of rows with insertion failed into protein_feature $protfeatureDB->{'dbname'}.protein_feature ", $rows_failed, "\n";
152 
153  $gifts_dbh->disconnect();
154  $protfeature_dbh->disconnect();
155 
156  return;
157 }
158 
159 #annotate the protein feature object with translation dbID, align_type along with other info fetched from giftsdb
160 sub annotate_ref{
161  my $ref = shift;
162 
163  #make a copy
164  my $protein_feature = undef;
165  my $ensp_id = $ref->[0];
166  my $ensp = $translation_adaptor->fetch_by_stable_id( $ensp_id ) ;
167  my $analysis_id = get_analysis_id("gifts_import");
168 
169  if($ensp){
170  $protein_feature->{'translation_id'} = $ensp->dbID; # ENSP dbID
171  $protein_feature->{'seq_start'} = 1;
172  $protein_feature->{'seq_end'} = $ensp->length() - 1;
173  $protein_feature->{'hit_start'} = 1;
174  $protein_feature->{'hit_end'} = $ensp->length() - 1;
175  $protein_feature->{'hit_name'} = $ref->[3];
176  $protein_feature->{'analysis_id'} = $analysis_id;
177  $protein_feature->{'cigar_line'} = $ref->[5];
178  $protein_feature->{'align_type'} = 'mdtag';
179  }
180 
181  return $protein_feature;
182 }
183 
184 #builds the multi-row insert statements in batch mode
185 sub build_insert_sql {
186  my ($gifts_dbh, $protfeature_dbh) = @_;
187 
188  #get analysis id
189  my $analysis_id = get_analysis_id("gifts_import");
190  print("Analysis id $analysis_id \n");
191 
192  my $select_sql = qq{SELECT DISTINCT ensp_id, 0, 0, concat_ws('.',uniprot_acc, uniprot_seq_version), $analysis_id as analysis_id, mdz, 'mdtag' FROM ensp_u_cigar};
193  $select_sql = $limit ? $select_sql . " limit $limit" : $select_sql;
194 
195  my $insert_sql = qq{INSERT INTO $protfeatureDB->{'dbname'}.protein_feature(translation_id, seq_start, seq_end, hit_start, hit_end, hit_name, analysis_id, cigar_line, align_type) VALUES };
196 
197  my $import_sth = $gifts_dbh->prepare($select_sql);
198  $import_sth->execute;
199  my $rows = $import_sth->rows;
200 
201  my $start = 0;
202  my $insert_values;
203  my $max_rows_in_insert = 999; #Batch size of 1000
204  my $rows_in_insert = 0;
205 
206  my @insert_container = ();
207  my $miss = 0;
208  my $not_ensp =0;
209  while ( my $ref = $import_sth->fetchrow_arrayref ) {
210 
211  if($ref->[0] !~ /^ENSP.*/){
212  warn "\t\t ensp id not in right format for ", $ref->[0], " Uniprot id ", $ref->[3], "\n";
213  $not_ensp++;
214  next;
215  }
216 
217  #add additinal protein_feture annotations to the ref object
218  my $protein_feature = annotate_ref($ref);
219 
220  unless($protein_feature){
221  warn "No translation record found in core translation for ", $ref->[0], " Uniprot id ", $ref->[3], "\n";
222  $miss++;
223  next;
224  }
225 
226  #multi-row insertions limited by the batch size
227  if ( $rows_in_insert < $max_rows_in_insert ) {
228  $insert_values .= ',' if $start++;
229  $insert_values .= '(' . $gifts_dbh->quote($protein_feature->{'translation_id'}) . ',' .
230  $gifts_dbh->quote($protein_feature->{'seq_start'}) . ',' .
231  $gifts_dbh->quote($protein_feature->{'seq_end'}) . ',' .
232  $gifts_dbh->quote($protein_feature->{'hit_start'}) . ',' .
233  $gifts_dbh->quote($protein_feature->{'hit_end'}) . ',' .
234  $gifts_dbh->quote($protein_feature->{'hit_name'}) . ',' .
235  $gifts_dbh->quote($protein_feature->{'analysis_id'}) . ',' .
236  $gifts_dbh->quote($protein_feature->{'cigar_line'}) . ',' .
237  $gifts_dbh->quote($protein_feature->{'align_type'}) . ')';
238  $rows_in_insert++;
239  }
240  else {
241  $insert_values .= ',' if $start++;
242  $insert_values .= '(' . $gifts_dbh->quote($protein_feature->{'translation_id'}) . ',' .
243  $gifts_dbh->quote($protein_feature->{'seq_start'}) . ',' .
244  $gifts_dbh->quote($protein_feature->{'seq_end'}) . ',' .
245  $gifts_dbh->quote($protein_feature->{'hit_start'}) . ',' .
246  $gifts_dbh->quote($protein_feature->{'hit_end'}) . ',' .
247  $gifts_dbh->quote($protein_feature->{'hit_name'}) . ',' .
248  $gifts_dbh->quote($protein_feature->{'analysis_id'}) . ',' .
249  $gifts_dbh->quote($protein_feature->{'cigar_line'}) . ',' .
250  $gifts_dbh->quote($protein_feature->{'align_type'}) . ')';
251 
252  my $insert_sql_with_values = $insert_sql . $insert_values;
253  push( @insert_container, $insert_sql_with_values );
254 
255  $insert_values = undef;
256  $rows_in_insert = 0;
257  $start = 0;
258  }
259 }
260 
261  #if the total number of rows is less than the batch size, you will reach here
262  if ($insert_values) {
263  my $insert_sql_with_values = $insert_sql . $insert_values;
264  push( @insert_container, $insert_sql_with_values );
265  }
266 
267  print "Number of rows fetched from GIFTS $rows \n";
268  print "Number of rows missed from GIFTS $miss \n";
269  print "Number of rows with wrong ENSP assignment from GIFTS $not_ensp \n";
270 
271  return \@insert_container;
272 
273 }
274 
275 #===========DB Connection routines=============
276 sub db_connect {
277  my ($connectDB) = @_;
278 
279  my $dbname = $connectDB->{'dbname'};
280  my $host = $connectDB->{'host'};
281  my $user = $connectDB->{'user'};
282  my $pass = $connectDB->{'pass'};
283  my $port = $connectDB->{'port'};
284 
285  my $dsn = "DBI:mysql:host=$host;";
286  if ($port) {
287  $dsn .= "port=$port;";
288  }
289  $dsn .= "database=$dbname";
290 
291  my $dbh = DBI->connect( $dsn, $user, $pass,
292  { 'PrintError' => 1, 'RaiseError' => 1 } );
293 
294  if ( !$dbh ) {
295  die "ERROR: $DBI::errstr";
296  }
297 
298  return $dbh;
299 }
300 
301 sub get_dba{
302  my ($connectDB) = @_;
303 
304  my $dbname = $connectDB->{'dbname'};
305  my $host = $connectDB->{'host'};
306  my $user = $connectDB->{'user'};
307  my $pass = $connectDB->{'pass'};
308  my $port = $connectDB->{'port'};
309 
310  my $dba = new Bio::EnsEMBL::DBSQL::DBAdaptor(
311  -host => $host,
312  -user => $user,
313  -dbname => $dbname,
314  -pass => $pass,
315  -port => $port
316  );
317 return $dba;
318 }
319 
320 sub get_registry {
321 
322  my $registry = "Bio::EnsEMBL::Registry";
323 
324  $registry->load_registry_from_db(
325  -host => $protfeatureDB->{'host'},
326  -user => $protfeatureDB->{'user'},
327  -pass => $protfeatureDB->{'pass'},
328  -port => $protfeatureDB->{'port'}
329 );
330 
331  return $registry;
332 }
333 
334 
335 
336 sub usage {
337  my $indent = ' ' x length($0);
338  print << 'EOF'; exit(0);
339 
340 The script populates the CORE db's protein_feature table with all the uniprot mappings found in GIFTS db's ensp_u_cigar table.
341 
342 
343 Options -gfhost -gfport -gfuser -gfpass -gfdb are mandatory and specify the credentials for the server on which a GIFTS database exists
344 Options -pfhost -pfport -pfuser -pfpass -pfdb are credentials of the core db where the GIFTS data will be copied into
345 
346 To run the script cd into the directory where the script lives and run the script:
347 cd ensembl/misc-scripts/gifts_import/
348 
349 
350 eg:
351 (with -dryrun flag won't insert)
352 perl populate_protein_feature.pl -gfhost localhost -gfport 3306 -gfuser prem -gfpass prem -gfdb carlos_ensembl_gifts -pfhost localhost -pfport 3306 -pfuser prem -pfpass prem -pfdb homo_sapiens_core_91_38 -dryrun
353 
354 (without -dryrun flag creates a full version)
355 perl populate_protein_feature.pl -gfhost localhost -gfport 3306 -gfuser prem -gfpass prem -gfdb carlos_ensembl_gifts -pfhost localhost -pfport 3306 -pfuser prem -pfpass prem -pfdb homo_sapiens_core_91_38
356 
357 
358 Usage:
359 
360  $0 -gfhost giftsdb_host -gfport giftsdb_port -gfuser giftsdb_user -gfpass giftsdb_pass -gfdb giftsdb_dbname -pfhost protfeatdb_host -pfport protfeatdb_port -pfuser protfeatdb_user -pfpass protfeatdb_pass -pfdb protfeatdb_dbname
361  $indent [-help]
362 
363  -gfhost GIFTS database host
364 
365  -gfport GIFTS database port
366 
367  -gfuser GIFTS database user
368 
369  -gfpass GIFTS database password
370 
371  -gfdb GIFTS database name
372 
373  -pfhost core database host
374 
375  -pfport core database port
376 
377  -pfuser core database user
378 
379  -pfpass core database pass
380 
381  -pfdb core database name
382 
383  -limit limit the number of rows to inserts - used only while testing
384 
385  -dryrun dryrun, no insert
386 
387  -help This message
388 
389 
390 EOF
391 
392 }
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
get_dba
public get_dba()
populate_protein_feature_db
public populate_protein_feature_db()
build_insert_sql
public build_insert_sql()
usage
public usage()
get_analysis_id
public get_analysis_id()
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
init_db
public init_db()
run
public run()
db_connect
public db_connect()
annotate_ref
public annotate_ref()
get_adaptor
public get_adaptor()
get_registry
public get_registry()