ensembl-hive  2.7.0
Mim2GeneParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::Mim2GeneParser;
21 
22 # For non-destructive substitutions in regexps (/r flag)
23 require 5.014_000;
24 
25 use strict;
26 use warnings;
27 
28 use Carp;
29 use List::Util;
30 use Text::CSV;
31 
32 use parent qw( XrefParser::BaseParser );
33 
34 
35 my $EXPECTED_NUMBER_OF_COLUMNS = 6;
36 
37 =head2 run
38 
39  Arg [1] : HashRef standard list of arguments from ParseSource
40  Example : $m2g_parser->run({ ... });
41  Description: Extract mappings between OMIM genes and other gene
42  identifiers from a tab-delimited file downloaded from
43  the DBASS Web site, then insert corresponding links
44  into the xref database:
45  - for entries mapped to Ensembl genes, we create
46  gene_direct_xref links;
47  - otherwise, if an entry is mapped to an EntrezGene ID
48  that exists in the xref database we create a
49  dependent_xref link.
50  In either case we update info_type of OMIM xrefs
51  accordingly.
52 
53  DEPENDENCIES: This parser must be run after:
54  - MIMParser - without existing OMIM entries this
55  parser does nothing;
56  - EntrezGeneParser - otherwise there will be no
57  dependent-xref links.
58 
59  mim2gene_medgen file begins with a header line with
60  6 tab separated columns:
61  #MIM number GeneID type Source MedGenCUI Comment
62  MIM number and GeneID are considered for the xrefs.
63 
64  Return type: none
65  Exceptions : throws on all processing errors
66  Caller : ParseSource in the xref pipeline
67  Status : Stable
68 
69 =cut
70 
sub run {
    # Entry point of the parser. Reads the first file named in
    # $ref_arg->{files} as a tab-delimited table and, for every data row
    # whose MIM number and GeneID are both already known to the xref
    # database, registers the MIM xref(s) as dependent on the matching
    # EntrezGene xref(s) via process_xref_entry().
    #
    # Named arguments (keys of $ref_arg):
    #   source_id  - required (only checked for presence here)
    #   species_id - required
    #   files      - required, ArrayRef; only the first element is used
    #   verbose    - optional, defaults to 0; prints a summary when true
    #   dbi        - optional, defaults to $self->dbi
    #
    # Returns 0 on completion; confesses on any processing error.
    my ( $self, $ref_arg ) = @_;

    my $general_source_id = $ref_arg->{source_id};
    my $species_id        = $ref_arg->{species_id};
    my $files             = $ref_arg->{files};
    my $verbose           = $ref_arg->{verbose} // 0;
    my $dbi               = $ref_arg->{dbi} // $self->dbi;

    if (    ( !defined $general_source_id )
         or ( !defined $species_id )
         or ( !defined $files ) )
    {
        confess "Need to pass source_id, species_id and files as pairs";
    }

    # Fail early and explicitly on an empty file list instead of letting
    # an undefined file name trickle down into get_filehandle() and the
    # error messages below.
    my $filename = $files->[0];
    if ( !defined $filename ) {
        confess "Need to pass at least one input file in 'files'";
    }

    my $csv = Text::CSV->new({
        sep_char => "\t",
    })
      || confess 'Failed to initialise CSV parser: ' . Text::CSV->error_diag();

    my $m2g_io = $self->get_filehandle($filename);
    if ( !defined $m2g_io ) {
        confess "Could not open file '${filename}'";
    }

    my $mim_gene_source_id =
      $self->get_source_id_for_source_name( 'MIM_GENE', undef, $dbi );
    my $mim_morbid_source_id =
      $self->get_source_id_for_source_name( 'MIM_MORBID', undef, $dbi );
    my $entrez_source_id =
      $self->get_source_id_for_source_name( 'EntrezGene', undef, $dbi );

    # This will be used to prevent insertion of duplicates
    $self->get_dependent_mappings( $mim_gene_source_id, $dbi );
    $self->get_dependent_mappings( $mim_morbid_source_id, $dbi );

    # FIXME: should we abort if any of these comes back empty?
    my (%mim_gene) =
      %{ $self->get_valid_codes( "MIM_GENE", $species_id, $dbi ) };
    my (%mim_morbid) =
      %{ $self->get_valid_codes( "MIM_MORBID", $species_id, $dbi ) };
    my (%entrez) =
      %{ $self->get_valid_codes( "EntrezGene", $species_id, $dbi ) };

    # Each MIM accession is looked up in both tables on every row because
    # some MIM entries can appear in both. Order matters only in that
    # MIM_GENE links are inserted before MIM_MORBID ones.
    my @mim_lookups = (
        [ \%mim_gene,   $mim_gene_source_id ],
        [ \%mim_morbid, $mim_morbid_source_id ],
    );

    my %is_known_type = map { $_ => 1 } (
        'gene',
        'gene/phenotype',
        'predominantly phenotypes',
        'phenotype',
    );

    # Initialise all counters to 0 so that we needn't handle possible undefs
    # while printing the summary
    my %counters = (
        'all_entries'         => 0,
        'dependent_on_entrez' => 0,
        'missed_master'       => 0,
        'missed_omim'         => 0,
    );

  RECORD:
    while ( my $line = $csv->getline( $m2g_io ) ) {

        my $is_comment = ( $line->[0] =~ m{ \A [#] }msx );
        if ( $is_comment ) {
            # At present we identify the header line among other comments
            # by checking if it has the expected number of tab-delimited
            # columns, which of course means we cannot identify header
            # lines with too few or too many column names. However, this
            # should be mostly harmless - something would have to be very,
            # very wrong with the input file for the header to have the
            # wrong number of column names without a change in the number
            # of actual columns in data rows.
            if (    ( scalar @{ $line } == $EXPECTED_NUMBER_OF_COLUMNS )
                 && ( !is_file_header_valid( @{ $line } ) ) )
            {
                confess "Malformed or unexpected header in Mim2Gene file '${filename}'";
            }
            next RECORD;
        }

        if ( scalar @{ $line } != $EXPECTED_NUMBER_OF_COLUMNS ) {
            confess ' Line ' . $csv->record_number()
              . " of input file '${filename}' has an incorrect number of columns";
        }

        # Columns are: MIM number, GeneID, type, Source, MedGenCUI,
        # Comment; only the first three are used. Do not modify the
        # contents of @{$line}, only the output - hence the /r.
        my ( $omim_acc, $entrez_id, $type )
          = map { s{\s+\z}{}rmsx } @{ $line }[ 0 .. 2 ];

        $counters{'all_entries'}++;

        # No point in doing anything if we have no matching MIM xref...
        if (    ( !defined $mim_gene{$omim_acc} )
             && ( !defined $mim_morbid{$omim_acc} ) )
        {
            $counters{'missed_omim'}++;
            next RECORD;
        }

        # ...or no EntrezGene xref to match it to
        if ( ( !$entrez_id ) || ( !defined $entrez{$entrez_id} ) ) {
            $counters{'missed_master'}++;
            next RECORD;
        }

        # An unknown type might indicate the change of input format,
        # therefore make sure the user notices it. That said, do not
        # bother if we do not have an xref this entry would operate on
        # anyway - which is why we only check this after the preceding
        # two presence checks.
        if ( !$is_known_type{$type} ) {
            warn "Unknown type $type for MIM Number '${omim_acc}' "
              . "(${filename}:" . $csv->record_number() . ")";
        }

        # With all the checks taken care of, insert the mappings.
        foreach my $lookup ( @mim_lookups ) {
            my ( $xref_ids_for, $mim_source_id ) = @{ $lookup };

            # The '// []' keeps the loop from autovivifying empty entries
            # in the lookup tables for accessions present in only one of
            # them.
            foreach my $mim_xref_id ( @{ $xref_ids_for->{$omim_acc} // [] } ) {
                $self->process_xref_entry({
                    'mim_xref_id'      => $mim_xref_id,
                    'mim_source_id'    => $mim_source_id,
                    'entrez_xrefs'     => $entrez{$entrez_id},
                    'entrez_source_id' => $entrez_source_id,
                    'counters'         => \%counters,
                    'dbi'              => $dbi,
                });
            }
        }

    } ## end record loop

    # getline() returns false both at end of file and on a parse error;
    # tell the two apart here.
    $csv->eof || confess 'Error parsing CSV: ' . $csv->error_diag();
    $m2g_io->close();

    if ( $verbose ) {
        print 'Processed ' . $counters{'all_entries'} . " entries. Out of those\n"
          . "\t" . $counters{'missed_omim'} . " had missing OMIM entries,\n"
          . "\t" . $counters{'dependent_on_entrez'} . " were dependent EntrezGene xrefs,\n"
          . "\t" . $counters{'missed_master'} . " had missing master entries.\n";
    }

    return 0;
} ## end sub run
228 
229 
230 =head2 is_file_header_valid
231 
232  Arg [1..N] : list of column names provided by Text::CSV::getline()
233  Example : if ( ! is_file_header_valid( @{ $csv->getline( $fh ) } ) ) {
234  confess 'Bad header';
235  }
236  Description: Verifies if the header of a Mim2Gene file follows expected
237  syntax.
238  We do not check the number of columns because that is what
239  we use to *detect* the header in the first place.
240  Return type: boolean
241  Exceptions : none
242  Caller : internal
243  Status : Stable
244 
245 =cut
246 
sub is_file_header_valid {
    # Check a list of header column names against the expected Mim2Gene
    # layout, position by position.
    #
    # Arguments: the header fields as a flat list of strings.
    # Returns  : 1 if every expected column name is found in the field at
    #            its position, 0 otherwise. Fields beyond the last
    #            expected column are ignored.
    my ( @header ) = @_;

    # One pattern per expected column, in file order. Only the first one
    # is anchored (to allow for the leading comment marker); the others
    # match anywhere within their field.
    my @field_patterns
      = (
         qr{ \A [#]? \s* MIM[ ]number }msx,
         qr{ GeneID }msx,
         qr{ type }msx,
         qr{ Source }msx,
         qr{ MedGenCUI }msx,
         qr{ Comment }msx,
       );

    # A header with fewer fields than expected columns cannot be valid.
    # Reject it up front rather than matching patterns against undef,
    # which would only produce uninitialized-value warnings.
    return 0 if scalar @header < scalar @field_patterns;

    foreach my $column ( 0 .. $#field_patterns ) {
        my $header_field = $header[$column];

        # Individual fields may still be undefined even if the list is
        # long enough.
        return 0 if !defined $header_field;
        return 0 if $header_field !~ $field_patterns[$column];
    }

    # If we have made it this far, all should be in order
    return 1;
}
270 
271 
272 =head2 process_xref_entry
273 
274  Arg [1] : HashRef of named arguments: mim_xref_id, mim_source_id, entrez_xrefs (ArrayRef), counters (HashRef), dbi
275  Example : $self->process_xref_entry({...});
276  Description: Wrapper around the most frequently repeated bit of
277  run(): loop over the list of matching
278  EntrezGene xrefs and insert dependent MIM xrefs.
279  Return type: none
280  Exceptions : none
281  Caller : internal
282  Status : Stable
283 
284 =cut
285 
sub process_xref_entry {
    # For one MIM xref, walk the list of matching EntrezGene xrefs and
    # hand each pairing to add_dependent_xref_maponly(), bumping the
    # 'dependent_on_entrez' counter once per pairing.
    #
    # Keys read from $arg_ref: mim_xref_id, mim_source_id, entrez_xrefs
    # (ArrayRef), counters (HashRef) and dbi. Returns nothing.
    my ( $self, $arg_ref ) = @_;

    # These do not change between iterations, so fetch them only once.
    my $dependent_xref_id = $arg_ref->{'mim_xref_id'};
    my $dependent_source  = $arg_ref->{'mim_source_id'};
    my $db_handle         = $arg_ref->{'dbi'};

    for my $master_xref_id ( @{ $arg_ref->{'entrez_xrefs'} } ) {
        $arg_ref->{'counters'}{'dependent_on_entrez'} += 1;

        # The fourth and sixth arguments are passed through as undef and
        # 1 respectively; their meaning is defined by the method itself.
        $self->add_dependent_xref_maponly(
            $dependent_xref_id,
            $dependent_source,
            $master_xref_id,
            undef,
            $db_handle,
            1,
        );
    }

    return;
}
302 
303 
304 1;
map
public map()
XrefParser::MIMParser
Definition: MIMParser.pm:3
XrefParser::BaseParser
Definition: BaseParser.pm:8
XrefParser::EntrezGeneParser
Definition: EntrezGeneParser.pm:21
run
public run()