my ($gifts_dbh, $protfeature_dbh) = @_;
#get analysis id
print("Analysis id $analysis_id \n");
my $select_sql = qq{SELECT DISTINCT ensp_id, 0, 0, concat_ws('.',uniprot_acc, uniprot_seq_version), $analysis_id as analysis_id, mdz, 'mdtag' FROM ensp_u_cigar};
$select_sql = $limit ? $select_sql . " limit $limit" : $select_sql;
my $insert_sql = qq{INSERT INTO $protfeatureDB->{'dbname'}.protein_feature(translation_id, seq_start, seq_end, hit_start, hit_end, hit_name, analysis_id, cigar_line, align_type) VALUES };
my $import_sth = $gifts_dbh->prepare($select_sql);
$import_sth->execute;
my $rows = $import_sth->rows;
my $start = 0;
my $insert_values;
my $max_rows_in_insert = 999; #Batch size of 1000
my $rows_in_insert = 0;
my @insert_container = ();
my $miss = 0;
my $not_ensp =0;
while ( my $ref = $import_sth->fetchrow_arrayref ) {
if($ref->[0] !~ /^ENSP.*/){
warn "\t\t ensp id not in right format for ", $ref->[0], " Uniprot id ", $ref->[3], "\n";
$not_ensp++;
next;
}
#add additinal protein_feture annotations to the ref object
unless($protein_feature){
warn "No translation record found in core translation for ", $ref->[0], " Uniprot id ", $ref->[3], "\n";
$miss++;
next;
}
#multi-row insertions limited by the batch size
if ( $rows_in_insert < $max_rows_in_insert ) {
$insert_values .= ',' if $start++;
$insert_values .= '(' . $gifts_dbh->quote($protein_feature->{'translation_id'}) . ',' .
$gifts_dbh->quote($protein_feature->{'seq_start'}) . ',' .
$gifts_dbh->quote($protein_feature->{'seq_end'}) . ',' .
$gifts_dbh->quote($protein_feature->{'hit_start'}) . ',' .
$gifts_dbh->quote($protein_feature->{'hit_end'}) . ',' .
$gifts_dbh->quote($protein_feature->{'hit_name'}) . ',' .
$gifts_dbh->quote($protein_feature->{'analysis_id'}) . ',' .
$gifts_dbh->quote($protein_feature->{'cigar_line'}) . ',' .
$gifts_dbh->quote($protein_feature->{'align_type'}) . ')';
$rows_in_insert++;
}
else {
$insert_values .= ',' if $start++;
$insert_values .= '(' . $gifts_dbh->quote($protein_feature->{'translation_id'}) . ',' .
$gifts_dbh->quote($protein_feature->{'seq_start'}) . ',' .
$gifts_dbh->quote($protein_feature->{'seq_end'}) . ',' .
$gifts_dbh->quote($protein_feature->{'hit_start'}) . ',' .
$gifts_dbh->quote($protein_feature->{'hit_end'}) . ',' .
$gifts_dbh->quote($protein_feature->{'hit_name'}) . ',' .
$gifts_dbh->quote($protein_feature->{'analysis_id'}) . ',' .
$gifts_dbh->quote($protein_feature->{'cigar_line'}) . ',' .
$gifts_dbh->quote($protein_feature->{'align_type'}) . ')';
my $insert_sql_with_values = $insert_sql . $insert_values;
push( @insert_container, $insert_sql_with_values );
$insert_values = undef;
$rows_in_insert = 0;
$start = 0;
}
}
#if the total number of rows is less than the batch size, you will reach here
if ($insert_values) {
my $insert_sql_with_values = $insert_sql . $insert_values;
push( @insert_container, $insert_sql_with_values );
}
print "Number of rows fetched from GIFTS $rows \n";
print "Number of rows missed from GIFTS $miss \n";
print "Number of rows with wrong ENSP assignment from GIFTS $not_ensp \n";
return \@insert_container;
}