ensembl-hive  2.7.0
MIMParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::MIMParser;
21 
22 use strict;
23 use warnings;
24 
25 use Carp;
26 
27 use parent qw( XrefParser::BaseParser );
28 
29 
30 my $QR_TI_FIELD_TERMINATORS
31  = qr{
32  (?: # The TI field spans from *FIELD* TI until:
33  [*]FIELD[*] # - the next field in same record, or
34  | [*]RECORD[*] # - the end of current record, or
35  | [*]THEEND[*] # - the end of input file
36  )
37  }msx;
38 
39 
40 
41 =head2 run
42 
43  Arg [1] : HashRef standard list of arguments from ParseSource
44  Example : $omim_parser->run({ ... });
45  Description: Extract Online Mendelian Inheritance in Man entries
46  from a text file downloaded from the OMIM Web site,
47  then insert corresponding xrefs into the xref
48  database. Note that all the xrefs produced by this
49  parser are unmapped and tagged as such; links of
50  appropriate type will be inserted by Mim2GeneParser.
51 
52  OMIM entries can either represent a unique locus,
53  describe a disorder, or both. In Ensembl these are
54  assigned, respectively, to: the source MIM_GENE,
55  the source MIM_MORBID, or independently into both.
56 
57  OMIM records are multiline. Each record begins with a
58  specific tag line and consists of a number of
59  fields. Each field starts with its own start-tag line
60  (i.e. the data proper only appears after a newline) and
61  continues until the beginning of either the next field
62  in the same record, the next record, or the
63  end-of-input tag. The overall structure looks as
64  follows:
65 
66  *RECORD*
67  *FIELD* NO
68  *FIELD* TI
69  *FIELD* TX
70  ...
71  *RECORD*
72  *FIELD* NO
73  *FIELD* TI
74  ...
75  *RECORD*
76  *FIELD* NO
77  ...
78  *FIELD* CD
79  *FIELD* ED
80  *THEEND*
81 
82  All the data relevant to the parser can be found in the
83  TI field.
84 
85  Return type: none
86  Exceptions : throws on all processing errors
87  Caller : ParseSource in the xref pipeline
88  Status : Stable
89 
90 =cut
91 
92 sub run {
93 
94  my ( $self, $ref_arg ) = @_;
95  my $general_source_id = $ref_arg->{source_id};
96  my $species_id = $ref_arg->{species_id};
97  my $files = $ref_arg->{files};
98  my $verbose = $ref_arg->{verbose} // 0;
99  my $dbi = $ref_arg->{dbi} // $self->dbi;
100 
101  if ( ( !defined $general_source_id ) or
102  ( !defined $species_id ) or
103  ( !defined $files ) )
104  {
105  confess "Need to pass source_id, species_id and files as pairs";
106  }
107 
108  my $filename = @{$files}[0];
109 
110  my %old_to_new;
111  my %removed;
112  my %counters;
113  my @sources;
114 
115  push @sources, $general_source_id;
116 
117  my $gene_source_id =
118  $self->get_source_id_for_source_name( "MIM_GENE", undef, $dbi );
119  push @sources, $gene_source_id;
120  my $morbid_source_id =
121  $self->get_source_id_for_source_name( "MIM_MORBID", undef, $dbi );
122  push @sources, $morbid_source_id;
123 
124  my %TYPE_SINGLE_SOURCES = (
125  q{*} => $gene_source_id,
126  q{} => $morbid_source_id,
127  q{#} => $morbid_source_id,
128  q{%} => $morbid_source_id,
129  );
130 
131  if ($verbose) {
132  print "sources are: " . join( ", ", @sources ) . "\n";
133  }
134 
135  IO::Handle->input_record_separator('*RECORD*');
136 
137  my $mim_io = $self->get_filehandle($filename);
138  if ( !defined $mim_io ) {
139  confess "Failed to acquire a file handle for '${filename}'";
140  }
141 
142  $mim_io->getline(); # first record is empty with *RECORD* as the
143  # record seperator
144 
145  RECORD:
146  while ( my $input_record = $mim_io->getline() ) {
147 
148  my $ti = extract_ti( $input_record );
149  if ( ! defined $ti ) {
150  confess 'Failed to extract TI field from record';
151  }
152 
153  my ( $type, $number, $long_desc ) = parse_ti( $ti );
154  if ( ! defined $type ) {
155  confess 'Failed to extract record type and description from TI field';
156  }
157 
158  # Use the first block of text as description
159  my @fields = split( qr{;;}msx, $long_desc );
160  my $label = $fields[0] . " [" . $type . $number . "]";
161 
162  my $xref_object = {
163  acc => $number,
164  label => $label,
165  desc => $long_desc,
166  species_id => $species_id,
167  dbi => $dbi,
168  info_type => 'UNMAPPED',
169  };
170 
171  if ( exists $TYPE_SINGLE_SOURCES{$type} ) {
172  my $type_source = $TYPE_SINGLE_SOURCES{$type};
173 
174  $xref_object->{'source_id'} = $type_source;
175  $counters{ $type_source }++;
176  $self->add_xref($xref_object);
177 
178  }
179  elsif ( $type eq q{+} ) { # both gene and phenotype
180 
181  $xref_object->{'source_id'} = $gene_source_id;
182  $counters{ $gene_source_id }++;
183  $self->add_xref($xref_object);
184 
185  $xref_object->{'source_id'} = $morbid_source_id;
186  $counters{ $morbid_source_id }++;
187  $self->add_xref($xref_object);
188 
189  }
190  elsif ( $type eq q{^} ) {
191  my ( $new_number ) = ( $long_desc =~ m{
192  MOVED\sTO\s
193  (\d+)
194  }msx );
195  if ( defined $new_number ) {
196  if ( $new_number ne $number ) {
197  $old_to_new{$number} = $new_number;
198  }
199  }
200  # Both leading and trailing whitespace has been removed
201  # so don't bother with another regex match, just compare.
202  elsif ( $long_desc eq 'REMOVED FROM DATABASE' ) {
203  $removed{$number} = 1;
204  $counters{ 'removed' }++;
205  }
206  else {
207  confess "Unsupported type of a '^' record: '${long_desc}'\n";
208  }
209 
210  }
211 
212  } ## record loop
213 
214  $mim_io->close();
215 
216  # Generate synonyms from "MOVED TO" entries
217  foreach my $mim ( keys %old_to_new ) {
218  my $old = $mim;
219  my $new = $old_to_new{$old};
220 
221  # Some entries in the MIM database have been moved multiple times,
222  # and we want each of the synonyms created this way to point to
223  # the *current* accession instead of one another. Keep traversing
224  # the chain of renames until we have reached the end, i.e. until
225  # $new is no longer a valid key in %old_to_new.
226  # FIXME: this is not entirely efficient, especially for long
227  # rename chains, because the foreach loop processes every single
228  # key of %old_to_new (i.e. every single "MOVED TO" entry) from
229  # scratch - even though some of them might have already been
230  # encountered in the process of traversing the change chains of
231  # previously encountered keys. Some sort of a cache pointing each
232  # of previously encountered keys to their respective final values,
233  # might be in order here.
234  # FIXME: If we do implement such a cache, compare performance for
235  # retrieving original keys in random order vs in descending
236  # numerical order. On the one hand starting with high accessions
237  # will likely allow us to process rename chains from shorter to
238  # longer ones, thus, maximising the use of the cache; on the other
239  # there is the O(n log n) cost of sorting to take into account.
240  while ( defined( $old_to_new{$new} ) ) {
241  $new = $old_to_new{$new};
242  }
243 
244  # With the latest value of $new no longer pointing to anything in
245  # %old_to_new, we have got two options: either we have finally
246  # reached an up-to-date entry number or the entry has ultimately
247  # been removed from the database. See if we have logged the
248  # removal, if we haven't add the synonyms - letting Ensembl figure
249  # out by itself to which of the three (two???) sources the
250  # relevant xrefs belong.
251  if ( !defined( $removed{$new} ) ) {
252  $self->add_to_syn_for_mult_sources( $new, \@sources, $old,
253  $species_id, $dbi );
254  $counters{ 'synonyms' }++;
255  }
256  }
257 
258  if ($verbose) {
259  print $counters{ $gene_source_id } . ' genemap and '
260  . $counters{ $morbid_source_id } . " phenotype MIM xrefs added\n"
261  . $counters{ 'synonyms' } . " synonyms (defined by MOVED TO) added\n"
262  . $counters{ 'removed' } . " entries removed\n";
263  }
264 
265  return 0;
266 } ## end sub run
267 
268 
269 =head2 extract_ti
270 
271  Arg [1] : String $input_record A single OMIM record
272  Example : my $ti_string = extract_ti( $omim_record );
273  Description: Scan the provided record for the TI field and extract
274  its contents, regardless of where in the record that
275  field appears or the position of the record in the
276  file.
277  Return type: String
278  Exceptions : none
279  Caller : MIMParser::run()
280  Status : Stable
281 
282 =cut
283 
284 sub extract_ti {
285  my ( $input_record ) = @_;
286 
287  my ( $ti )
288  = ( $input_record =~ m{
289  [*]FIELD[*]\sTI\n
290  (.+?) # (important: NON-greedy match)
291  \n?
292  $QR_TI_FIELD_TERMINATORS
293  }msx );
294 
295  return $ti;
296 }
297 
298 
299 
300 =head2 parse_ti
301 
302  Arg [1] : String $ti Contents of a single TI field
303  Example : my ( $type_symbol, $omim_number, $description )
304  = parse_ti( $ti_string );
305  Description: Extract the type symbol, the entry number and the
306  description from the contents of an OMIM record's TI
307  field. The description is *not* split into possible
308  individual components, we do however remove line breaks
309  from multiline entries.
310  Return type: Array
311  Exceptions : none
312  Caller : MIMParser::run()
313  Status : Stable
314 
315 =cut
316 
317 sub parse_ti {
318  my ( $ti ) = @_;
319 
320  # Remove line breaks, making sure we do not accidentally concatenate words
321  $ti =~ s{
322  (?:
323  ;;\n
324  | \n;;
325  )
326  }{;;}gmsx;
327  $ti =~ s{\n}{ }gmsx;
328 
329  # Extract the 'type' and the whole description
330  my @captures = ( $ti =~ m{
331  \A
332  ([#%+*^]*) # type of entry
333  (\d+) # accession number, same as in NO
334  \s+ # normally just one space
335  (.+) # description of entry
336  }msx );
337 
338  return @captures;
339 }
340 
341 
342 
343 1;
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()
XrefParser::Mim2GeneParser
Definition: Mim2GeneParser.pm:5