3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 package XrefParser::Mim2GeneParser;
22 # For non-destructive substitutions in regexps (/r flag)
35 my $EXPECTED_NUMBER_OF_COLUMNS = 6;
39 Arg [1] : HashRef standard list of arguments from ParseSource
40 Example : $m2g_parser->run({ ... });
41 Description: Extract mappings between OMIM genes and other gene
42 identifiers from a tab-delimited file downloaded from
43 the NCBI FTP site, then insert corresponding links
44 into the xref database:
45 -
for entries mapped to Ensembl genes, we create
46 gene_direct_xref links;
47 - otherwise,
if an entry is mapped to an EntrezGene ID
48 that exists in the xref database we create a
50 In either
case we update info_type of OMIM xrefs
53 DEPENDENCIES: This parser must be
run after:
54 -
MIMParser - without existing OMIM entries
this
59 mim2gene_medgen file begins with a header line with
60 6 tab separated columns:
61 #MIM number GeneID type Source MedGenCUI Comment
62 MIM number and GeneID are considered
for the xrefs.
65 Exceptions :
throws on all processing errors
66 Caller : ParseSource in the xref pipeline
73 my ( $self, $ref_arg ) = @_;
74 my $general_source_id = $ref_arg->{source_id};
75 my $species_id = $ref_arg->{species_id};
76 my $files = $ref_arg->{files};
77 my $verbose = $ref_arg->{verbose}
78 my $dbi = $ref_arg->{dbi}
80 if ( ( !defined $general_source_id ) or
81 ( !defined $species_id ) or
84 confess
"Need to pass source_id, species_id and files as pairs";
87 my $csv = Text::CSV->new({
90 || confess
'Failed to initialise CSV parser: ' . Text::CSV->error_diag();
92 my $filename = @{$files}[0];
94 my $m2g_io = $self->get_filehandle($filename);
95 if ( !defined $m2g_io ) {
96 confess
"Could not open file '${filename}'";
99 my $mim_gene_source_id =
100 $self->get_source_id_for_source_name(
'MIM_GENE', undef, $dbi );
101 my $mim_morbid_source_id =
102 $self->get_source_id_for_source_name(
'MIM_MORBID', undef, $dbi );
103 my $entrez_source_id =
104 $self->get_source_id_for_source_name(
'EntrezGene', undef, $dbi );
106 # This will be used to prevent insertion of duplicates
107 $self->get_dependent_mappings( $mim_gene_source_id, $dbi );
108 $self->get_dependent_mappings( $mim_morbid_source_id, $dbi );
110 # FIXME: should we abort if any of these comes back empty?
112 %{ $self->get_valid_codes(
"MIM_GENE", $species_id, $dbi ) };
114 %{ $self->get_valid_codes(
"MIM_MORBID", $species_id, $dbi ) };
116 %{ $self->get_valid_codes(
"EntrezGene", $species_id, $dbi ) };
118 # Initialise all counters to 0 so that we needn't handle possible undefs
119 # while printing the summary
122 'dependent_on_entrez' => 0,
123 'missed_master' => 0,
128 while ( my $line = $csv->getline( $m2g_io ) ) {
136 # At present we identify the header line among other comments by
137 # checking if it has the expected number of tab-delimited
138 # columns, which of course means we cannot identify header lines
139 # with too few or too many column names. However, this should be
140 # mostly harmless - something would have to be very, very wrong
141 # with the input file for the header to have the wrong number of
142 # column names without a change in the number of actual columns
144 if ( ( scalar @{ $line } == $EXPECTED_NUMBER_OF_COLUMNS )
145 && ( ! is_file_header_valid( @{ $line } ) ) ) {
146 confess
"Malformed or unexpected header in Mim2Gene file '${filename}'";
151 if ( scalar @{ $line } != $EXPECTED_NUMBER_OF_COLUMNS ) {
152 confess
' Line ' . $csv->record_number()
153 .
" of input file '${filename}' has an incorrect number of columns";
156 # Do not modify the contents of @{$line}, only the output - hence the /r.
157 my ( $omim_acc, $entrez_id, $type, $source, $medgen, $comment )
158 =
map { s{\s+\z}{}rmsx } @{ $line };
160 $counters{
'all_entries'}++;
162 # No point in doing anything if we have no matching MIM xref...
163 if ( ( !defined $mim_gene{$omim_acc} ) &&
164 ( !defined $mim_morbid{$omim_acc} ) )
166 $counters{
'missed_omim'}++;
170 # ...or no EntrezGene xref to match it to
171 if ( ( ( ! $entrez_id ) || ( ! defined $entrez{$entrez_id} ) ) ) {
172 $counters{
'missed_master'}++;
176 # An unknown type might indicate the change of input format,
177 # therefore make sure the user notices it. That said, do not
178 # bother if we do not have an xref this entry would operate on anyway
179 # - which is why we only check this after the preceding two
182 if ( ( $type ne
'gene')
183 && ( $type ne
'gene/phenotype' )
184 && ( $type ne
'predominantly phenotypes' )
185 && ( $type ne
'phenotype' ) ) {
186 warn
"Unknown type $type for MIM Number '${omim_acc}' "
187 .
"(${filename}:" . $csv->record_number() .
")";
190 # With all the checks taken care of, insert the mappings. We check
191 # both MIM_GENE and MIM_MORBID every time because some MIM entries
192 # can appear in both.
193 foreach my $mim_xref_id ( @{ $mim_gene{$omim_acc} } ) {
194 $self->process_xref_entry({
195 'mim_xref_id' => $mim_xref_id,
196 'mim_source_id' => $mim_gene_source_id,
197 'entrez_xrefs' => $entrez{$entrez_id},
198 'entrez_source_id' => $entrez_source_id,
199 'counters' => \%counters,
203 foreach my $mim_xref_id ( @{ $mim_morbid{$omim_acc} } ) {
204 $self->process_xref_entry({
205 'mim_xref_id' => $mim_xref_id,
206 'mim_source_id' => $mim_morbid_source_id,
207 'entrez_xrefs' => $entrez{$entrez_id},
208 'entrez_source_id' => $entrez_source_id,
209 'counters' => \%counters,
216 $csv->eof || confess
'Error parsing CSV: ' . $csv->error_diag();
220 print
'Processed ' . $counters{
'all_entries'} .
" entries. Out of those\n"
221 .
"\t" . $counters{
'missed_omim'} .
" had missing OMIM entries,\n"
222 .
"\t" . $counters{
'dependent_on_entrez'} .
" were dependent EntrezGene xrefs,\n"
223 .
"\t" . $counters{
'missed_master'} .
" had missing master entries.\n";
230 =head2 is_file_header_valid
232 Arg [1..N] : list of column names provided by Text::CSV::getline()
233 Example : if ( ! is_file_header_valid( @{ $csv->getline( $fh ) } ) ) {
234 confess
'Bad header';
236 Description: Verifies
if the header of a Mim2Gene file follows expected
238 We
do not check the number of columns because that is what
239 we use to *detect* the header in the first place.
247 sub is_file_header_valid {
252 qr{ \A [#]? \s* MIM[ ]number }msx,
261 foreach my $pattern (@field_patterns) {
262 $header_field = shift @header;
263 # Make sure we run the regex match in scalar context
264 return 0 unless scalar ( $header_field =~ m{ $pattern }msx );
267 # If we have made it this far, all should be in order
272 =head2 process_xref_entry
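274 Arg [1] : HashRef list of named arguments, as passed by run():
mim_xref_id, mim_source_id, entrez_xrefs (ArrayRef),
entrez_source_id, counters (HashRef)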
275 Example : $self->process_xref_entry({...});
276 Description: Wrapper around the most frequently repeated bit of
277 run(): loop over the list of matching
278 EntrezGene xrefs and insert dependent MIM xrefs.
286 sub process_xref_entry {
287 my ( $self, $arg_ref ) = @_;
289 foreach my $ent_id ( @{ $arg_ref->{
'entrez_xrefs'} } ) {
290 $arg_ref->{
'counters'}->{
'dependent_on_entrez'}++;
291 $self->add_dependent_xref_maponly( $arg_ref->{
'mim_xref_id'},
292 $arg_ref->{
'mim_source_id'},