ensembl-hive  2.7.0
DBASSParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::DBASSParser;
21 
22 # For non-destructive substitutions in regexps (/r flag)
23 require 5.014_000;
24 
25 use strict;
26 use warnings;
27 
28 use Carp;
29 use Readonly;
30 use Text::CSV;
31 
32 use parent qw( XrefParser::BaseParser );
33 
34 
35 Readonly my $EXPECTED_NUMBER_OF_COLUMNS => 23;
36 
37 
38 
=head2 run

  Arg [1]    : HashRef standard list of arguments from ParseSource.
               Keys read by this method:
                 source_id  - required
                 species_id - required
                 files      - required, ArrayRef; only the first
                              element is parsed
                 verbose    - optional, defaults to 0
                 dbi        - optional, defaults to $self->dbi
  Example    : $dbass_parser->run({ ... });
  Description: Extract DBASS3/DBASS5 entries from a comma-delimited
               file downloaded from the DBASS Web site, then insert
               corresponding xrefs and gene_direct_xref links into the
               xref database.

               Every line of the file, header included, must have at
               least $EXPECTED_NUMBER_OF_COLUMNS columns. Only the
               first four are used:
                 1) DBASS Gene ID
                 2) DBASS Gene Name
                 3) DBASS Gene Description
                 4) Ensembl Gene ID
               with the first line containing column names and all
               subsequent ones containing entries proper. All column
               values, including names from the header as well as any
               empty strings, can be surrounded by pairs of double
               quotes.

               DBASS Gene Name can be either a single name, a
               'name/synonym' pair, or a 'name (synonym)' pair.
               Whitespace around either part of a pair is discarded
               and an empty synonym is ignored.

               Ensembl Gene ID can be an empty string, indicating an
               unmapped entry. Such entries are counted but produce no
               xrefs.

  Return type: Int; 0 on success
  Exceptions : throws on all processing errors, including CSV parse
               errors encountered in the middle of the file
  Caller     : ParseSource in the xref pipeline
  Status     : Stable

=cut

sub run {
  my ( $self, $ref_arg ) = @_;
  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};
  my $verbose    = $ref_arg->{verbose} // 0;
  my $dbi        = $ref_arg->{dbi} // $self->dbi;

  if ( ( !defined $source_id ) or
       ( !defined $species_id ) or
       ( !defined $files ) )
  {
    croak 'Need to pass source_id, species_id and files as pairs';
  }
  my $csv = Text::CSV->new()
    || confess 'Failed to initialise CSV parser: ' . Text::CSV->error_diag();

  # Only the first file of the list is ever parsed.
  my $filename = $files->[0];

  my $file_io = $self->get_filehandle($filename);
  if ( !defined($file_io) ) {
    confess "Failed to acquire a file handle for '${filename}'";
  }

  # header() consumes the first line of the file and returns its column
  # names as a list.
  if ( ! is_file_header_valid( $csv->header( $file_io ) ) ) {
    confess "Malformed or unexpected header in DBASS file '${filename}'";
  }

  my $processed_count = 0;
  my $unmapped_count  = 0;

  while ( defined( my $line = $csv->getline( $file_io ) ) ) {

    if ( scalar @{ $line } < $EXPECTED_NUMBER_OF_COLUMNS ) {
      # +2: one for the header line, one because the counters only
      # reflect records handled *before* the current one.
      confess 'Line ' . (2 + $processed_count + $unmapped_count)
        . " of input file '${filename}' has an incorrect number of columns";
    }

    # Only the first four columns are of interest. Strip trailing
    # whitespace without modifying the contents of @{$line} - hence
    # the /r.
    my ( $dbass_gene_id, $dbass_gene_name, $dbass_full_name, $ensembl_id )
      = map { s{\s+\z}{}rmsx } @{ $line }[ 0 .. 3 ];

    # Do not attempt to create unmapped xrefs. Checking truthiness is good
    # enough here because the only non-empty string evaluating as false is
    # not a valid Ensembl stable ID.
    if ( !$ensembl_id ) {
      ++$unmapped_count;
      next;
    }

    my ( $label, $synonym ) = _split_gene_name($dbass_gene_name);
    my $type    = 'gene';
    my $version = '1';

    # Reuse an existing xref for this accession if there is one.
    my $xref_id =
      $self->get_xref( $dbass_gene_id, $source_id, $species_id, $dbi );

    if ( ( ! defined $xref_id ) || ( $xref_id eq q{} ) ) {
      $xref_id = $self->add_xref({
        acc        => $dbass_gene_id,
        version    => $version,
        label      => $label,
        source_id  => $source_id,
        dbi        => $dbi,
        species_id => $species_id,
        info_type  => 'DIRECT'
      });
    }

    # "FOO/" or "FOO ()" yield an empty synonym, which carries no
    # information and must not be stored.
    if ( defined $synonym && $synonym ne q{} ) {
      $self->add_synonym( $xref_id, $synonym, $dbi );
    }

    $self->add_direct_xref( $xref_id, $ensembl_id, $type, undef, $dbi );

    ++$processed_count;

  } ## end while ( defined( my $line...))

  # getline() returns undef both at end of file and on a parse error;
  # eof() is the only way to tell the two apart. Without this check a
  # malformed record would silently truncate the import.
  if ( !$csv->eof ) {
    confess "Error parsing DBASS file '${filename}': " . $csv->error_diag();
  }
  $file_io->close();

  if ($verbose) {
    printf( "%d direct xrefs succesfully processed\n", $processed_count );
    printf( "Skipped %d unmapped xrefs\n", $unmapped_count );
  }

  return 0;
} ## end sub run


# Split a DBASS gene name into a ( label, synonym ) pair.
#
# DBASS files list synonyms in two ways: either "FOO (BAR)" (with or
# without space) or "FOO/BAR". The slash form is tried first. A name in
# neither form is returned unchanged as the label, with an undefined
# synonym. When a form matches, both parts are returned with leading
# and trailing whitespace removed; the synonym may then be an empty
# string.
sub _split_gene_name {
  my ($dbass_gene_name) = @_;

  # A failed match in list context yields an empty list, leaving both
  # variables undefined; a successful one defines both captures.
  my ( $label, $synonym ) = $dbass_gene_name =~ m{
                                                   (.*)
                                                   \s? / \s?  # typically no ws here but just in case
                                                   (.*)
                                                 }msx;
  if ( !defined $label ) {
    ( $label, $synonym ) = $dbass_gene_name =~ m{
                                                 (.*)
                                                 \s?  # there are entries both with and without ws
                                                 [(] (.*) [)]
                                               }msx;
  }

  if ( !defined $label ) {
    return ( $dbass_gene_name, undef );
  }

  # The greedy (.*) swallows whitespace that the optional \s? was meant
  # to consume, so the captures have to be trimmed explicitly.
  for my $name ( $label, $synonym ) {
    $name =~ s{\A\s+}{}msx;
    $name =~ s{\s+\z}{}msx;
  }

  return ( $label, $synonym );
}
184 
185 
=head2 is_file_header_valid

  Arg [1..N] : list of column names provided by Text::CSV::header()
  Example    : if ( !is_file_header_valid( $csv->header( $fh ) ) ) {
                 confess 'Bad header';
               }
  Description: Checks that a DBASS header has at least
               $EXPECTED_NUMBER_OF_COLUMNS columns and that the
               first, second and fourth column are named 'id',
               'genesymbol' and 'ensemblreference' respectively.
               No other column name is inspected.
  Return type: boolean (1 if the header is acceptable, 0 otherwise)
  Exceptions : none
  Caller     : internal
  Status     : Stable

=cut

sub is_file_header_valid {
  my @column_names = @_;

  # A header with too few columns cannot be valid, so there is no
  # point in looking at individual names.
  return 0 if scalar @column_names < $EXPECTED_NUMBER_OF_COLUMNS;

  # [ zero-based column index, required name ], checked in this order.
  my @required_columns = (
    [ 0, 'id' ],
    [ 1, 'genesymbol' ],
    [ 3, 'ensemblreference' ],
  );

  for my $requirement (@required_columns) {
    my ( $index, $expected_name ) = @{$requirement};
    return 0 if $column_names[$index] ne $expected_name;
  }

  # Every required column is present under its expected name.
  return 1;
}
222 
223 
224 1;
map
public map()
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()