# Parse a tab-separated Mim2Gene file. For every data row whose MIM number
# has a known MIM_GENE and/or MIM_MORBID xref and whose Entrez gene ID has
# a known EntrezGene xref, hand each matching MIM xref to
# process_xref_entry() together with the EntrezGene xrefs.
#
# Arguments (a single hash reference):
#   source_id  - required; only checked for presence in this routine
#   species_id - required; used when looking up the valid xref codes
#   files      - required; array reference, only its first element is read
#   verbose    - optional; if true, a summary of the counters is printed
#   dbi        - optional; passed unchanged to every helper that takes it
#
# Returns 0 after the whole file has been consumed.
#
# Confesses if a required argument is missing, the CSV parser cannot be
# created, the input file cannot be opened, the header line is rejected, a
# data row has the wrong number of columns, or the CSV parser stops before
# reaching end of file.
my ( $self, $ref_arg ) = @_;

my $general_source_id = $ref_arg->{source_id};
my $species_id        = $ref_arg->{species_id};
my $files             = $ref_arg->{files};
my $verbose           = $ref_arg->{verbose};
# NOTE(review): no fallback is applied when 'dbi' is not supplied, so undef
# is forwarded to the helpers below - confirm they cope with that.
my $dbi               = $ref_arg->{dbi};

if ( ( !defined $general_source_id ) or
     ( !defined $species_id ) or
     ( !defined $files ) )
{
  confess "Need to pass source_id, species_id and files as pairs";
}

my $csv = Text::CSV->new({
                          sep_char => "\t",
                        })
  || confess 'Failed to initialise CSV parser: ' . Text::CSV->error_diag();

# Only the first file of the list is processed
my $filename = $files->[0];

my $m2g_io = $self->get_filehandle($filename);
if ( !defined $m2g_io ) {
  confess "Could not open file '${filename}'";
}

my $mim_gene_source_id =
  $self->get_source_id_for_source_name( 'MIM_GENE', undef, $dbi );
my $mim_morbid_source_id =
  $self->get_source_id_for_source_name( 'MIM_MORBID', undef, $dbi );
my $entrez_source_id =
  $self->get_source_id_for_source_name( 'EntrezGene', undef, $dbi );

# This will be used to prevent insertion of duplicates
$self->get_dependent_mappings( $mim_gene_source_id, $dbi );
$self->get_dependent_mappings( $mim_morbid_source_id, $dbi );

# FIXME: should we abort if any of these comes back empty?
my (%mim_gene) =
  %{ $self->get_valid_codes( "MIM_GENE", $species_id, $dbi ) };
my (%mim_morbid) =
  %{ $self->get_valid_codes( "MIM_MORBID", $species_id, $dbi ) };
my (%entrez) =
  %{ $self->get_valid_codes( "EntrezGene", $species_id, $dbi ) };

# Lookup table / source ID pairs, in the order in which mappings are
# inserted for each record: MIM_GENE first, then MIM_MORBID.
my @mim_sources = (
  [ \%mim_gene,   $mim_gene_source_id ],
  [ \%mim_morbid, $mim_morbid_source_id ],
);

# Entry types we know how to interpret; anything else triggers a warning
my %is_known_type = map { $_ => 1 }
  ( 'gene', 'gene/phenotype', 'predominantly phenotypes', 'phenotype' );

# Initialise all counters to 0 so that we needn't handle possible undefs
# while printing the summary
my %counters = (
  'all_entries'         => 0,
  'dependent_on_entrez' => 0,
  'missed_master'       => 0,
  'missed_omim'         => 0,
);

RECORD:
while ( my $line = $csv->getline( $m2g_io ) ) {

  # Comment lines are those whose first field begins with '#'
  my $is_comment = ( $line->[0] =~ m{ \A [#] }msx );

  if ( $is_comment ) {
    # At present we identify the header line among other comments by
    # checking if it has the expected number of tab-delimited
    # columns, which of course means we cannot identify header lines
    # with too few or too many column names. However, this should be
    # mostly harmless - something would have to be very, very wrong
    # with the input file for the header to have the wrong number of
    # column names without a change in the number of actual columns
    # in data rows.
    #
    # NOTE(review): the validity test is assumed to be provided by
    # is_file_header_valid(), defined elsewhere in this file - confirm.
    if ( ( scalar @{ $line } == $EXPECTED_NUMBER_OF_COLUMNS )
         && ( !is_file_header_valid( @{ $line } ) ) ) {
      confess "Malformed or unexpected header in Mim2Gene file '${filename}'";
    }
    next RECORD;
  }

  if ( scalar @{ $line } != $EXPECTED_NUMBER_OF_COLUMNS ) {
    confess ' Line ' . $csv->record_number()
      . " of input file '${filename}' has an incorrect number of columns";
  }

  # Strip trailing whitespace from every field. Do not modify the
  # contents of @{$line}, only the output - hence the /r.
  my ( $omim_acc, $entrez_id, $type, $source, $medgen, $comment )
    = map { s{\s+\z}{}rmsx } @{ $line };

  $counters{'all_entries'}++;

  # No point in doing anything if we have no matching MIM xref...
  if ( ( !defined $mim_gene{$omim_acc} ) &&
       ( !defined $mim_morbid{$omim_acc} ) )
  {
    $counters{'missed_omim'}++;
    next RECORD;
  }

  # ...or no EntrezGene xref to match it to
  if ( ( !$entrez_id ) || ( !defined $entrez{$entrez_id} ) ) {
    $counters{'missed_master'}++;
    next RECORD;
  }

  # An unknown type might indicate the change of input format,
  # therefore make sure the user notices it. That said, do not
  # bother if we do not have an xref this entry would operate on
  # anyway - which is why we only check this after the preceding two
  # presence checks.
  if ( !$is_known_type{$type} ) {
    warn "Unknown type $type for MIM Number '${omim_acc}' "
      . "(${filename}:" . $csv->record_number() . ")";
  }

  # With all the checks taken care of, insert the mappings. We check
  # both MIM_GENE and MIM_MORBID every time because some MIM entries
  # can appear in both.
  foreach my $mim_source ( @mim_sources ) {
    my ( $xrefs_for, $mim_source_id ) = @{ $mim_source };

    # Fall back to an empty list so that a missing entry is neither
    # dereferenced nor autovivified in the lookup table
    foreach my $mim_xref_id ( @{ $xrefs_for->{$omim_acc} // [] } ) {
      $self->process_xref_entry({
        'mim_xref_id'      => $mim_xref_id,
        'mim_source_id'    => $mim_source_id,
        'entrez_xrefs'     => $entrez{$entrez_id},
        'entrez_source_id' => $entrez_source_id,
        'counters'         => \%counters,
        'dbi'              => $dbi,
      });
    }
  }

} ## end record loop

# getline() returns false both at end of file and on a parse error; tell
# the two apart here
$csv->eof or confess 'Error parsing CSV: ' . $csv->error_diag();

$m2g_io->close();

if ( $verbose ) {
  print 'Processed ' . $counters{'all_entries'} . " entries. Out of those\n"
    . "\t" . $counters{'missed_omim'} . " had missing OMIM entries,\n"
    . "\t" . $counters{'dependent_on_entrez'} . " were dependent EntrezGene xrefs,\n"
    . "\t" . $counters{'missed_master'} . " had missing master entries.\n";
}

return 0;