3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
26 This parser reads a simple comma-delimited file downloaded from the Human Protein Atlas (HPA) database and creates direct xrefs from it.
27 The database contains two types of antibodies: its own HPA antibodies and commercial Collaborator antibodies (CAB).
31 The columns of the file should be the following:
38 Antibody,antibody_id,ensembl_peptide_id,link
39 CAB000001,1,ENSP00000363822,http:
40 CAB000001,1,ENSP00000379358,http:
53 package XrefParser::HPAParser;
63 my $EXPECTED_NUMBER_OF_COLUMNS = 4;
68 The run method does the actual parsing and creation of direct xrefs.
69 The parser is initialized as noted above and run is called from
70 Bio::EnsEMBL::Production::Pipeline::Xrefs::ParseSource
78 my ( $self, $ref_arg ) = @_;
79 my $source_id = $ref_arg->{source_id};
80 my $species_id = $ref_arg->{species_id};
81 my $files = $ref_arg->{files};
82 my $verbose = $ref_arg->{verbose}
83 my $dbi = $ref_arg->{dbi}
85 if ( ( !defined $source_id ) or
86 ( !defined $species_id ) or
87 ( !defined $files ) ) {
88 confess
'Need to pass source_id, species_id, and files';
91 my $file = @{$files}[0];
93 my $file_io = $self->get_filehandle($file);
94 if ( !defined $file_io ) {
95 confess
"Could not open $file\n";
98 my $input_file = Text::CSV->new({
102 }) or confess
"Cannot use file $file: " . Text::CSV->error_diag();
104 if ( ! is_file_header_valid( $input_file->header( $file_io ) ) ) {
105 confess
"Malformed or unexpected header in HPA file '${file}'";
108 my $parsed_count = 0;
109 while ( my $data = $input_file->getline($file_io) ) {
110 my ( $antibody_name, $antibody_id, $ensembl_id ) = @{ $data };
112 $self->add_to_direct_xrefs({
115 label => $antibody_name,
116 stable_id => $ensembl_id,
117 type =>
'translation',
118 source_id => $source_id,
119 species_id => $species_id,
120 info_type =>
'DIRECT'
127 confess
"Error parsing file $file: " . $input_file->error_diag();
131 printf(
"%d direct xrefs succesfully parsed\n", $parsed_count );
138 =head2 is_file_header_valid
140 Arg [1..N] : list of column names provided by Text::CSV::header()
141 Example : if ( ! is_file_header_valid( $csv->header( $fh ) ) ) {
142 confess
'Bad header';
144 Description: Verifies if the header of an HPA file follows the expected
153 sub is_file_header_valid {
156 # Don't bother with parsing column names if their number does not
157 # match to begin with
158 if ( scalar @header != $EXPECTED_NUMBER_OF_COLUMNS ) {
165 qr{ antibody_id }msx,
166 qr{ ensembl_peptide_id }msx,
171 foreach my $pattern (@field_patterns) {
172 $header_field = shift @header;
173 # Make sure we run the regex match in scalar context
174 return 0 unless scalar ( $header_field =~ m{ $pattern }msx );
177 # If we have made it this far, all should be in order