ensembl-hive  2.8.1
HPAParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 =head1 NAME
21 
23 
24 =head1 DESCRIPTION
25 
26 This parser reads a simple comma-delimited file downloaded from the Human Protein Atlas (HPA) database and creates direct xrefs from it.
27 The database contains two types of antibody, their own HPA antibodies and Collaborator antibody (CAB) commercial antibodies.
28 
29  data_uri = http://www.proteinatlas.org/download/xref.php
30 
31 The columns of the file should be the following:
32 
33  1) Antibody
34  2) Antibody ID
35  3) Ensembl Peptide ID
36  4) Link (URL)
37 
38  Antibody,antibody_id,ensembl_peptide_id,link
39  CAB000001,1,ENSP00000363822,http://www.proteinatlas.org/ENSG00000169083-AR
40  CAB000001,1,ENSP00000379358,http://www.proteinatlas.org/ENSG00000169083-AR
41 
42 =head1 SYNOPSIS
43 
44  my $parser = XrefParser::HPAParser->new($db->dbh);
45  $parser->run({
46  source_id => 11,
47  species_id => 9606,
48  files => ["hpa.txt"],
49  });
50 
51 =cut
52 
53 package XrefParser::HPAParser;
54 
55 use strict;
56 use warnings;
57 
58 use Carp;
59 use Text::CSV;
60 
61 use parent qw( XrefParser::BaseParser);
62 
63 my $EXPECTED_NUMBER_OF_COLUMNS = 4;
64 
65 
66 
67 =head2 run
68 The run method does the actual parsing and creation of direct xrefs.
69 Parser gets initialized as noted above and run is called from
70 Bio::EnsEMBL::Production::Pipeline::Xrefs::ParseSource
71 
72 my $parser = XrefParser::HPAParser->new($db->dbh);
73 $parser->run(...);
74 
75 =cut
76 
# Parse an HPA cross-reference CSV file and register one direct xref for
# every data row.
#
# Arguments (a single hash reference):
#   source_id  - required; stored on every xref created
#   species_id - required; stored on every xref created
#   files      - required; array reference, only its first entry is read
#   verbose    - optional, defaults to 0; when true a summary line is printed
#   dbi        - optional, defaults to $self->dbi
#
# Returns 0 once the whole file has been consumed.
# Confesses on missing arguments, when the file cannot be opened, when the
# header is not the expected one, and when Text::CSV reports a parse error.
sub run {
    my ( $self, $ref_arg ) = @_;
    my $source_id  = $ref_arg->{source_id};
    my $species_id = $ref_arg->{species_id};
    my $files      = $ref_arg->{files};
    my $verbose    = $ref_arg->{verbose} // 0;
    my $dbi        = $ref_arg->{dbi} // $self->dbi;

    if (   ( !defined $source_id )
        or ( !defined $species_id )
        or ( !defined $files ) )
    {
        confess 'Need to pass source_id, species_id, and files';
    }

    # Only the first file of the list is processed. An empty list is
    # treated like a missing argument instead of being handed on as an
    # undefined file name.
    my $file = $files->[0];
    if ( !defined $file ) {
        confess 'Need to pass source_id, species_id, and files';
    }

    my $file_io = $self->get_filehandle($file);
    if ( !defined $file_io ) {
        confess "Could not open $file\n";
    }

    my $input_file = Text::CSV->new({
        sep_char       => q{,},
        empty_is_undef => 1,
        strict         => 1,
    }) or confess "Cannot use file $file: " . Text::CSV->error_diag();

    # header() consumes the first line of the file; its list of column
    # names is what gets validated.
    if ( !is_file_header_valid( $input_file->header($file_io) ) ) {
        confess "Malformed or unexpected header in HPA file '${file}'";
    }

    my $parsed_count = 0;
    while ( my $data = $input_file->getline($file_io) ) {
        # The fourth column (link) is not needed for the xref
        my ( $antibody_name, $antibody_id, $ensembl_id ) = @{$data};

        $self->add_to_direct_xrefs({
            acc        => $antibody_id,
            version    => '1',
            label      => $antibody_name,
            stable_id  => $ensembl_id,
            type       => 'translation',
            source_id  => $source_id,
            species_id => $species_id,
            info_type  => 'DIRECT',
            # Hand over the handle resolved above so that a caller-supplied
            # 'dbi' is not silently ignored.
            # NOTE(review): assumes add_to_direct_xrefs() reads a 'dbi' key
            # from its argument hash - confirm against BaseParser.
            dbi        => $dbi,
        });

        ++$parsed_count;
    }

    # getline() returns false both at end of file and on a parse error;
    # only the former is acceptable.
    $input_file->eof
        or confess "Error parsing file $file: " . $input_file->error_diag();
    $file_io->close();

    if ($verbose) {
        printf( "%d direct xrefs succesfully parsed\n", $parsed_count );
    }

    return 0;
} ## end sub run
136 
137 
138 =head2 is_file_header_valid
139 
140  Arg [1..N] : list of column names as returned by Text::CSV::header()
141  Example : if ( ! is_file_header_valid( $csv->header( $fh ) ) ) {
142  confess 'Bad header';
143  }
144  Description: Verifies if the header of a HPA file follows expected
145  syntax.
146  Return type: boolean
147  Exceptions : none
148  Caller : internal
149  Status : Stable
150 
151 =cut
152 
# Check that the column names of an HPA file header are the expected ones,
# in the expected order.
#
# Takes the header fields as a flat list. Returns 1 when there are exactly
# $EXPECTED_NUMBER_OF_COLUMNS of them and every field matches the pattern
# for its position, 0 otherwise. Does not throw.
sub is_file_header_valid {
    my (@header) = @_;

    # Don't bother with parsing column names if their number does not
    # match to begin with
    if ( scalar @header != $EXPECTED_NUMBER_OF_COLUMNS ) {
        return 0;
    }

    # One pattern per column, in file order. The patterns carry no anchors,
    # so each one is a case-sensitive substring test.
    my @field_patterns = (
        qr{ antibody }msx,
        qr{ antibody_id }msx,
        qr{ ensembl_peptide_id }msx,
        qr{ link }msx,
    );

    foreach my $pattern (@field_patterns) {
        my $header_field = shift @header;

        # An undefined column name can never be valid; bail out here rather
        # than feeding undef to the regex engine.
        return 0 if !defined $header_field;

        return 0 if $header_field !~ $pattern;
    }

    # If we have made it this far, all should be in order
    return 1;
}
180 
181 
182 1;
XrefParser::BaseParser
Definition: BaseParser.pm:8
XrefParser::HPAParser::run
public run()
run
public run()
XrefParser::HPAParser
Definition: HPAParser.pm:34