2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 #The script populates the CORE db's protein_feature table with all the uniprot mappings found in GIFTS db's ensp_u_cigar table.
22 use DBI qw( :sql_types );
27 my ( $gfhost, $gfport, $gfuser, $gfpass, $gfdb );
28 my ( $pfhost, $pfport, $pfuser, $pfpass, $pfdb );
36 "gfhost=s" => \$gfhost,
37 "gfport=i" => \$gfport,
38 "gfuser=s" => \$gfuser,
39 "gfpass=s" => \$gfpass,
42 "pfhost=s" => \$pfhost,
43 "pfport=i" => \$pfport,
44 "pfuser=s" => \$pfuser,
45 "pfpass=s" => \$pfpass,
49 "dryrun!" => \$dryrun,
54 usage() if ( !defined $gfhost || !defined $gfport || !defined $gfuser || !defined $gfdb );
55 usage() if ( !defined $pfhost || !defined $pfport || !defined $pfuser || !defined $pfpass );
57 my ( $giftsDB, $protfeatureDB ) =
init_db();
58 die unless defined($giftsDB) and defined($protfeatureDB);
60 print "giftsDB credentials:\n";
61 print Dumper($giftsDB);
63 print "protfeatureDB credentials:\n";
64 print Dumper($protfeatureDB);
67 my $translation_adaptor = $registry->
get_adaptor( 'human', 'core', 'translation' );
74 # This is where the gifts info will be read from
84 # This is where the gifts info will be written to
93 return ( $giftsDB, $protfeatureDB );
96 #get the analysis_id for gifts_import
98 my ($logic_name) = @_;
101 my $dba =
get_dba($protfeatureDB);
102 my $analysis_dba = $dba->get_AnalysisAdaptor();
103 my $analysis = $analysis_dba->fetch_by_logic_name($logic_name);
105 $analysis_id = $analysis->dbID()
if defined $analysis;
113 my $protfeature_dbh =
db_connect($protfeatureDB);
116 print
"Got connection to GiftsDB \n";
119 if($protfeature_dbh){
120 print
"Got connection to ProtfeatureDB \n";
123 #container to hold the sql statements in batches (default: 1000)
126 my $rows_inserted = 0;
128 my @failed_stmts = undef;
132 foreach my $insert_stmt (@$insert_container) {
134 my $affected = $protfeature_dbh->do($insert_stmt);
135 $rows_inserted += $affected;
136 print
"Affeced $affected Rows inserted $rows_inserted\n";
141 push(@failed_stmts, $@);
145 print
"FAILED STATEMENTS\n";
146 print Dumper(\@failed_stmts);
147 print
"INSERT END\n";
150 print
"Number of rows inserted into protein_feature $protfeatureDB->{'dbname'}.protein_feature ", $rows_inserted,
"\n";
151 print
"Number of rows with insertion failed into protein_feature $protfeatureDB->{'dbname'}.protein_feature ", $rows_failed,
"\n";
153 $gifts_dbh->disconnect();
154 $protfeature_dbh->disconnect();
159 #annotate the protein feature object with translation dbID, align_type along with other info fetched from giftsdb
164 my $protein_feature = undef;
165 my $ensp_id = $ref->[0];
166 my $ensp = $translation_adaptor->fetch_by_stable_id( $ensp_id ) ;
170 $protein_feature->{
'translation_id'} = $ensp->dbID; # ENSP dbID
171 $protein_feature->{
'seq_start'} = 1;
172 $protein_feature->{
'seq_end'} = $ensp->length() - 1;
173 $protein_feature->{
'hit_start'} = 1;
174 $protein_feature->{
'hit_end'} = $ensp->length() - 1;
175 $protein_feature->{
'hit_name'} = $ref->[3];
176 $protein_feature->{
'analysis_id'} = $analysis_id;
177 $protein_feature->{
'cigar_line'} = $ref->[5];
178 $protein_feature->{
'align_type'} =
'mdtag';
181 return $protein_feature;
184 #builds the multi-row insert statements in batch mode
186 my ($gifts_dbh, $protfeature_dbh) = @_;
190 print(
"Analysis id $analysis_id \n");
192 my $select_sql = qq{SELECT DISTINCT ensp_id, 0, 0, concat_ws(
'.',uniprot_acc, uniprot_seq_version), $analysis_id as analysis_id, mdz,
'mdtag' FROM ensp_u_cigar};
193 $select_sql = $limit ? $select_sql .
" limit $limit" : $select_sql;
195 my $insert_sql = qq{INSERT INTO $protfeatureDB->{
'dbname'}.protein_feature(translation_id, seq_start, seq_end, hit_start, hit_end, hit_name, analysis_id, cigar_line, align_type) VALUES };
197 my $import_sth = $gifts_dbh->prepare($select_sql);
198 $import_sth->execute;
199 my $rows = $import_sth->rows;
203 my $max_rows_in_insert = 999; #Batch size of 1000
204 my $rows_in_insert = 0;
206 my @insert_container = ();
209 while ( my $ref = $import_sth->fetchrow_arrayref ) {
211 if($ref->[0] !~ /^ENSP.*/){
212 warn
"\t\t ensp id not in right format for ", $ref->[0],
" Uniprot id ", $ref->[3],
"\n";
217 #add additinal protein_feture annotations to the ref object
220 unless($protein_feature){
221 warn
"No translation record found in core translation for ", $ref->[0],
" Uniprot id ", $ref->[3],
"\n";
226 #multi-row insertions limited by the batch size
227 if ( $rows_in_insert < $max_rows_in_insert ) {
228 $insert_values .=
',' if $start++;
229 $insert_values .=
'(' . $gifts_dbh->quote($protein_feature->{
'translation_id'}) .
',' .
230 $gifts_dbh->quote($protein_feature->{
'seq_start'}) .
',' .
231 $gifts_dbh->quote($protein_feature->{
'seq_end'}) .
',' .
232 $gifts_dbh->quote($protein_feature->{
'hit_start'}) .
',' .
233 $gifts_dbh->quote($protein_feature->{
'hit_end'}) .
',' .
234 $gifts_dbh->quote($protein_feature->{
'hit_name'}) .
',' .
235 $gifts_dbh->quote($protein_feature->{
'analysis_id'}) .
',' .
236 $gifts_dbh->quote($protein_feature->{
'cigar_line'}) .
',' .
237 $gifts_dbh->quote($protein_feature->{
'align_type'}) .
')';
241 $insert_values .=
',' if $start++;
242 $insert_values .=
'(' . $gifts_dbh->quote($protein_feature->{
'translation_id'}) .
',' .
243 $gifts_dbh->quote($protein_feature->{
'seq_start'}) .
',' .
244 $gifts_dbh->quote($protein_feature->{
'seq_end'}) .
',' .
245 $gifts_dbh->quote($protein_feature->{
'hit_start'}) .
',' .
246 $gifts_dbh->quote($protein_feature->{
'hit_end'}) .
',' .
247 $gifts_dbh->quote($protein_feature->{
'hit_name'}) .
',' .
248 $gifts_dbh->quote($protein_feature->{
'analysis_id'}) .
',' .
249 $gifts_dbh->quote($protein_feature->{
'cigar_line'}) .
',' .
250 $gifts_dbh->quote($protein_feature->{
'align_type'}) .
')';
252 my $insert_sql_with_values = $insert_sql . $insert_values;
253 push( @insert_container, $insert_sql_with_values );
255 $insert_values = undef;
261 #if the total number of rows is less than the batch size, you will reach here
262 if ($insert_values) {
263 my $insert_sql_with_values = $insert_sql . $insert_values;
264 push( @insert_container, $insert_sql_with_values );
267 print
"Number of rows fetched from GIFTS $rows \n";
268 print
"Number of rows missed from GIFTS $miss \n";
269 print
"Number of rows with wrong ENSP assignment from GIFTS $not_ensp \n";
271 return \@insert_container;
275 #===========DB Connection routines=============
277 my ($connectDB) = @_;
279 my $dbname = $connectDB->{
'dbname'};
280 my $host = $connectDB->{
'host'};
281 my $user = $connectDB->{
'user'};
282 my $pass = $connectDB->{
'pass'};
283 my $port = $connectDB->{
'port'};
285 my $dsn =
"DBI:mysql:host=$host;";
287 $dsn .=
"port=$port;";
289 $dsn .=
"database=$dbname";
291 my $dbh = DBI->connect( $dsn, $user, $pass,
292 {
'PrintError' => 1,
'RaiseError' => 1 } );
295 die
"ERROR: $DBI::errstr";
302 my ($connectDB) = @_;
304 my $dbname = $connectDB->{
'dbname'};
305 my $host = $connectDB->{
'host'};
306 my $user = $connectDB->{
'user'};
307 my $pass = $connectDB->{
'pass'};
308 my $port = $connectDB->{
'port'};
322 my $registry =
"Bio::EnsEMBL::Registry";
324 $registry->load_registry_from_db(
325 -host => $protfeatureDB->{
'host'},
326 -user => $protfeatureDB->{
'user'},
327 -pass => $protfeatureDB->{
'pass'},
328 -port => $protfeatureDB->{
'port'}
337 my $indent =
' ' x length($0);
338 print <<
'EOF'; exit(0);
340 The script populates the CORE db
's protein_feature table with all the uniprot mappings found in GIFTS db's ensp_u_cigar table.
343 Options -gfhost -gfport -gfuser -gfpass -gfdb are mandatory and specify the credentials
for the server on which a GIFTS database exists
344 Options -pfhost -pfport -pfuser -pfpass -pfdb are credentials of the core db where the GIFTS data will be copied into
346 To
run the script cd into the directory where the script lives and
run the script:
347 cd ensembl/misc-scripts/gifts_import/
351 (with -dryrun flag won
't insert)
352 perl populate_protein_feature.pl -gfhost localhost -gfport 3306 -gfuser prem -gfpass prem -gfdb carlos_ensembl_gifts -pfhost localhost -pfport 3306 -pfuser prem -pfpass prem -pfdb homo_sapiens_core_91_38 -dryrun
354 (without -dryrun flag creates a full version)
355 perl populate_protein_feature.pl -gfhost localhost -gfport 3306 -gfuser prem -gfpass prem -gfdb carlos_ensembl_gifts -pfhost localhost -pfport 3306 -pfuser prem -pfpass prem -pfdb homo_sapiens_core_91_38
360 $0 -gfhost giftsdb_host -gfport giftsdb_port -gfuser giftsdb_user -gfpass giftsdb_pass -gfdb giftsdb_dbname -pfhost protfeatdb_host -pfport protfeatdb_port -pfuser protfeatdb_user -pfpass protfeatdb_pass -pfdb protfeatdb_dbname
363 -gfhost GIFTS database host
365 -gfport GIFTS database port
367 -gfuser GIFTS database user
369 -gfpass GIFTS database password
371 -gfdb GIFTS database name
373 -pfhost core database host
375 -pfport core database port
377 -pfuser core database user
379 -pfpass core database pass
381 -pfdb core database name
383 -limit limit the number of rows to inserts - used only while testing
385 -dryrun dryrun, no insert