3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 package XrefParser::MIMParser;
30 my $QR_TI_FIELD_TERMINATORS
32 (?: # The TI field spans from *FIELD* TI until:
33 [*]FIELD[*] # - the next field in same record, or
34 | [*]RECORD[*] # - the end of current record, or
35 | [*]THEEND[*] # - the end of input file
43 Arg [1] : HashRef standard list of arguments from ParseSource
44 Example : $omim_parser->run({ ... });
45 Description: Extract Online Mendelian Inheritance in Man entries
46 from a text file downloaded from the OMIM Web site,
47 then insert corresponding xrefs into the xref
48 database. Note that all the xrefs produced by
this
49 parser are unmapped and tagged as such; links of
52 OMIM entries can either represent a unique locus,
53 describe a disorder, or both. In Ensembl these are
54 assigned, respectively, to: the source MIM_GENE,
55 the source MIM_MORBID, or independently into both.
57 OMIM records are multiline. Each record begins with a
58 specific tag line and consists of a number of
59 fields. Each field starts with its own start-tag line
60 (i.e. the data proper only appears after a newline) and
61 continues until the beginning of either the next field
62 in the same record, the next record, or the
63 end-of-input tag. The overall structure looks as
82 All the data relevant to the parser can be found in the
86 Exceptions :
throws on all processing errors
87 Caller : ParseSource in the xref pipeline
94 my ( $self, $ref_arg ) = @_;
95 my $general_source_id = $ref_arg->{source_id};
96 my $species_id = $ref_arg->{species_id};
97 my $files = $ref_arg->{files};
98 my $verbose = $ref_arg->{verbose}
99 my $dbi = $ref_arg->{dbi}
101 if ( ( !defined $general_source_id ) or
102 ( !defined $species_id ) or
103 ( !defined $files ) )
105 confess
"Need to pass source_id, species_id and files as pairs";
108 my $filename = @{$files}[0];
115 push @sources, $general_source_id;
118 $self->get_source_id_for_source_name(
"MIM_GENE", undef, $dbi );
119 push @sources, $gene_source_id;
120 my $morbid_source_id =
121 $self->get_source_id_for_source_name(
"MIM_MORBID", undef, $dbi );
122 push @sources, $morbid_source_id;
124 my %TYPE_SINGLE_SOURCES = (
125 q{*} => $gene_source_id,
126 q{} => $morbid_source_id,
127 q{#} => $morbid_source_id,
128 q{%} => $morbid_source_id,
132 print
"sources are: " . join(
", ", @sources ) .
"\n";
135 IO::Handle->input_record_separator(
'*RECORD*');
137 my $mim_io = $self->get_filehandle($filename);
138 if ( !defined $mim_io ) {
139 confess
"Failed to acquire a file handle for '${filename}'";
142 $mim_io->getline(); # first record is empty with *RECORD* as the
146 while ( my $input_record = $mim_io->getline() ) {
148 my $ti = extract_ti( $input_record );
149 if ( ! defined $ti ) {
150 confess
'Failed to extract TI field from record';
153 my ( $type, $number, $long_desc ) = parse_ti( $ti );
154 if ( ! defined $type ) {
155 confess
'Failed to extract record type and description from TI field';
158 # Use the first block of text as description
159 my @fields = split( qr{;;}msx, $long_desc );
160 my $label = $fields[0] .
" [" . $type . $number .
"]";
166 species_id => $species_id,
168 info_type =>
'UNMAPPED',
171 if ( exists $TYPE_SINGLE_SOURCES{$type} ) {
172 my $type_source = $TYPE_SINGLE_SOURCES{$type};
174 $xref_object->{
'source_id'} = $type_source;
175 $counters{ $type_source }++;
176 $self->add_xref($xref_object);
179 elsif ( $type eq q{+} ) { # both gene and phenotype
181 $xref_object->{
'source_id'} = $gene_source_id;
182 $counters{ $gene_source_id }++;
183 $self->add_xref($xref_object);
185 $xref_object->{
'source_id'} = $morbid_source_id;
186 $counters{ $morbid_source_id }++;
187 $self->add_xref($xref_object);
190 elsif ( $type eq q{^} ) {
191 my ( $new_number ) = ( $long_desc =~ m{
195 if ( defined $new_number ) {
196 if ( $new_number ne $number ) {
197 $old_to_new{$number} = $new_number;
200 # Both leading and trailing whitespace has been removed
201 # so don't bother with another regex match, just compare.
202 elsif ( $long_desc eq
'REMOVED FROM DATABASE' ) {
203 $removed{$number} = 1;
204 $counters{
'removed' }++;
207 confess
"Unsupported type of a '^' record: '${long_desc}'\n";
216 # Generate synonyms from "MOVED TO" entries
217 foreach my $mim ( keys %old_to_new ) {
219 my $new = $old_to_new{$old};
221 # Some entries in the MIM database have been moved multiple times,
222 # and we want each of the synonyms created this way to point to
223 # the *current* accession instead of one another. Keep traversing
224 # the chain of renames until we have reached the end, i.e. until
225 # $new is no longer a valid key in %old_to_new.
226 # FIXME: this is not entirely efficient, especially for long
227 # rename chains, because the foreach loop processes every single
228 # key of %old_to_new (i.e. every single "MOVED TO" entry) from
229 # scratch - even though some of them might have already been
230 # encountered in the process of traversing the change chains of
231 # previously encountered keys. Some sort of a cache pointing each
232 # of previously encountered keys to their respective final values,
233 # might be in order here.
234 # FIXME: If we do implement such a cache, compare performance for
235 # retrieving original keys in random order vs in descending
236 # numerical order. On the one hand starting with high accessions
237 # will likely allow us to process rename chains from shorter to
238 # longer ones, thus, maximising the use of the cache; on the other
239 # there is the O(n log n) cost of sorting to take into account.
240 while ( defined( $old_to_new{$new} ) ) {
241 $new = $old_to_new{$new};
244 # With the latest value of $new no longer pointing to anything in
245 # %old_to_new, we have got two options: either we have finally
246 # reached an up-to-date entry number or the entry has ultimately
247 # been removed from the database. See if we have logged the
248 # removal, if we haven't add the synonyms - letting Ensembl figure
249 # out by itself to which of the three (two???) sources the
250 # relevant xrefs belong.
251 if ( !defined( $removed{$new} ) ) {
252 $self->add_to_syn_for_mult_sources( $new, \@sources, $old,
254 $counters{
'synonyms' }++;
259 print $counters{ $gene_source_id } .
' genemap and '
260 . $counters{ $morbid_source_id } .
" phenotype MIM xrefs added\n"
261 . $counters{
'synonyms' } .
" synonyms (defined by MOVED TO) added\n"
262 . $counters{
'removed' } .
" entries removed\n";
271 Arg [1] : String $input_record A single OMIM record
272 Example : my $ti_string = extract_ti( $omim_record );
273 Description: Scan the provided record
for the TI field and extract
274 its contents, regardless of where in the record that
275 field appears or the position of the record in the
285 my ( $input_record ) = @_;
288 = ( $input_record =~ m{
290 (.+?)
# (important: NON-greedy match)
292 $QR_TI_FIELD_TERMINATORS
302 Arg [1] : String $ti Contents of a single TI field
303 Example : my ( $type_symbol, $omim_number, $description )
304 = parse_ti( $ti_string );
305 Description: Extract the type symbol, the entry number and the
306 description from the contents of an OMIM record
's TI
307 field. The description is *not* split into possible
308 individual components, we do however remove line breaks
309 from multiline entries.
312 Caller : MIMParser::run()
320 # Remove line breaks, making sure we do not accidentally concatenate words
329 # Extract the 'type
' and the whole description
330 my @captures = ( $ti =~ m{
332 ([#%+*^]*) # type of entry
333 (\d+) # accession number, same as in NO
334 \s+ # normally just one space
335 (.+) # description of entry