3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
A parser class to parse the MGI (descriptions) source. Creates 'MISC' xrefs using the MGI
accession with description, and also creates the synonyms extracted from the
pipe-separated synonym_field.
29 -species = mus_musculus
33 -columns = [
accession chromosome position start end strand label status marker marker_type feature_type synonym_field]
41 files => [
"MRK_List2.rpt"],
46 package XrefParser::MGI_Desc_Parser;
55 my $EXPECTED_NUMBER_OF_COLUMNS = 12;
The run method does the actual parsing and creation of xrefs and synonyms.
The parser is initialized as noted above, and run is called from
Bio::EnsEMBL::Production::Pipeline::Xrefs::ParseSource.
69 my ($self, $ref_arg) = @_;
70 my $source_id = $ref_arg->{source_id};
71 my $species_id = $ref_arg->{species_id};
72 my $files = $ref_arg->{files};
73 my $verbose = $ref_arg->{verbose}
74 my $dbi = $ref_arg->{dbi}
76 if ( ( !defined $source_id ) or
77 ( !defined $species_id ) or
78 ( !defined $files ) ) {
79 confess
'Need to pass source_id, species_id and files as pairs';
82 my $file = @{$files}[0];
84 my $mgi_io = $self->get_filehandle($file);
85 if ( !defined $mgi_io ) {
86 confess
"Could not open $file\n";
89 my $input_file = Text::CSV->new({
95 }) or confess
"Cannot use file $file: " . Text::CSV->error_diag();
101 # read and validate header
102 if ( ! is_file_header_valid( $input_file->header( $mgi_io ) ) ) {
103 confess
"Malformed or unexpected header in MGI_Desc file '${file}'";
106 while ( my $data = $input_file->getline($mgi_io) ) {
107 my $accession = $data->[0];
108 my $marker = $data->[8];
110 $acc_to_xref{$accession} = $self->add_xref({
114 source_id => $source_id,
115 species_id => $species_id,
119 if ( $verbose && !$marker ) {
120 print
"$accession has no description\n";
124 if ( defined $acc_to_xref{$accession} ) {
126 my $synonym_field = $data->[11];
127 if ( $synonym_field ) {
128 @synonyms = split qr{ [|] }msx, $synonym_field;
130 foreach my $syn (@synonyms) {
131 $self->add_synonym( $acc_to_xref{$accession}, $syn, $dbi );
136 } ## end
while ( my $data = $input_file...)
139 || confess
"Error parsing file $file: " . $input_file->error_diag();
143 print
"$xref_count MGI Description Xrefs added\n";
144 print
"$syn_count synonyms added\n";
147 return 0; #successful
151 =head2 is_file_header_valid
153 Arg [1..N] : list of column names provided by Text::CSV::header()
Example    : if ( ! is_file_header_valid( $csv->header( $fh ) ) ) {
               confess 'Bad header';
             }
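Description: Verifies that the header of an MGI_Desc file follows the expected
             format: it must have the expected number of columns, and each
             column name must match the expected name in order.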
166 sub is_file_header_valid {
169 # Don't bother with parsing column names if their number does not
170 # match to begin with
171 if ( scalar @header != $EXPECTED_NUMBER_OF_COLUMNS ) {
180 'genome coordinate start',
181 'genome coordinate end',
188 'marker synonyms (pipe-separated)',
192 foreach my $pattern (@field_patterns) {
193 $header_field = shift @header;
194 if ( $header_field ne $pattern ) {
199 # If we have made it this far, all should be in order