3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 package XrefParser::DBASSParser;
22 # For non-destructive substitutions in regexps (/r flag)
35 Readonly my $EXPECTED_NUMBER_OF_COLUMNS => 23;
41 Arg [1] : HashRef standard list of arguments from ParseSource
42 Example : $dbass_parser->run({ ... });
43 Description: Extract DBASS3/DBASS5 entries from a comma-delimited
44 file downloaded from the DBASS Web site, then insert
45 corresponding xrefs and gene_direct_xref links into the xref database.
48 The columns of the file should be the following:
51 3) DBASS Gene Description
53 with the first line containing column names and all
54 subsequent ones containing entries proper. All column
55 values, including names from the header as well as any
56 empty strings, can be surrounded by pairs of
double quotes.
59 DBASS Gene Name can be either a single name, a
60 'name/synonym' pair, or a
'name (synonym)' pair.
62 Ensembl Gene ID can be an empty string, indicating an unmapped entry.
66 Exceptions :
throws on all processing errors
67 Caller : ParseSource in the xref pipeline
73 my ( $self, $ref_arg ) = @_;
74 my $source_id = $ref_arg->{source_id};
75 my $species_id = $ref_arg->{species_id};
76 my $files = $ref_arg->{files};
77 my $verbose = $ref_arg->{verbose}
78 my $dbi = $ref_arg->{dbi}
80 if ( ( !defined $source_id ) or
81 ( !defined $species_id ) or
84 croak
'Need to pass source_id, species_id and files as pairs';
86 my $csv = Text::CSV->new()
87 || confess
'Failed to initialise CSV parser: ' . Text::CSV->error_diag();
89 my $filename = @{$files}[0];
91 my $file_io = $self->get_filehandle($filename);
92 if ( !defined($file_io) ) {
93 confess
"Failed to acquire a file handle for '${filename}'";
96 if ( ! is_file_header_valid( $csv->header( $file_io ) ) ) {
97 confess
"Malformed or unexpected header in DBASS file '${filename}'";
100 my $processed_count = 0;
101 my $unmapped_count = 0;
103 while ( defined( my $line = $csv->getline( $file_io ) ) ) {
105 if ( scalar @{ $line } < $EXPECTED_NUMBER_OF_COLUMNS ) {
106 confess
'Line ' . (2 + $processed_count + $unmapped_count)
107 .
" of input file '${filename}' has an incorrect number of columns";
110 # Do not modify the contents of @{$line}, only the output - hence the /r.
111 my ( $dbass_gene_id, $dbass_gene_name, $dbass_full_name, $ensembl_id )
112 =
map { s{\s+\z}{}rmsx } @{ $line };
114 # Do not attempt to create unmapped xrefs. Checking truthiness is good
115 # enough here because the only non-empty string evaluating as false is
116 # not a valid Ensembl stable ID.
119 # DBASS files list synonyms in two ways: either "FOO (BAR)" (with or
120 # without space) or "FOO/BAR". Both forms are relevant to us.
121 my ( $first_gene_name, $second_gene_name );
122 if ( ( $dbass_gene_name =~ m{
124 \s?\/\s? # typically no ws here but just in
case
127 ( $dbass_gene_name =~ m{
129 \s? # there are entries both with and without ws
132 $first_gene_name = $1;
133 $second_gene_name = $2;
136 $first_gene_name = $dbass_gene_name;
137 $second_gene_name = undef;
140 my $label = $first_gene_name;
141 my $synonym = $second_gene_name;
146 $self->get_xref( $dbass_gene_id, $source_id, $species_id, $dbi );
148 if ( ( ! defined $xref_id ) || ( $xref_id eq q{} ) ) {
149 $xref_id = $self->add_xref({
150 acc => $dbass_gene_id,
153 source_id => $source_id,
155 species_id => $species_id,
156 info_type =>
'DIRECT'
160 if ( defined($synonym) ) {
161 $self->add_synonym( $xref_id, $synonym, $dbi );
164 $self->add_direct_xref( $xref_id, $ensembl_id, $type, undef, $dbi );
172 } ## end
while ( defined( my $line...))
178 printf(
"%d direct xrefs succesfully processed\n", $processed_count );
179 printf(
"Skipped %d unmapped xrefs\n", $unmapped_count );
186 =head2 is_file_header_valid
188 Arg [1..N] : list of column names provided by Text::CSV::header()
189 Example : if ( !is_file_header_valid( $csv->header( $fh ) ) ) {
190 confess
'Bad header';
192 Description: Verifies
that the header of a DBASS file follows the expected
193 syntax and contains the expected column names.
201 sub is_file_header_valid {
204 # Don't bother with parsing column names if there are fewer of them
205 # than expected to begin with
206 if ( scalar @header < $EXPECTED_NUMBER_OF_COLUMNS ) {
210 my $dbass_end = ( $header[0] eq
'id' );
211 return 0 unless $dbass_end;
213 my $dbass_name_ok = ( $header[1] eq
'genesymbol' );
214 return 0 unless $dbass_name_ok;
216 my $ensembl_id_ok = ( $header[3] eq
'ensemblreference' );
217 return 0 unless $ensembl_id_ok;
219 # If we have made it this far, all should be in order