ensembl-hive  2.8.1
xref_config2sql.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 # $Id$
19 
20 ########################################################################
21 # #
22 # This script will take the 'xref_config.ini' configuration #
23 # file (or whatever file name given on the command line) and #
24 # convert it into a SQL file that can be used in place of the old #
25 # 'populate_metadata.sql' file found in the 'sql' subdirectory. #
26 # #
27 # The output from this script should be redirected to a file that #
28 # you manually run to populate your Xref database, just as was done #
29 # with 'populate_metadata.sql'. The safest thing to do is just to #
30 # overwrite 'sql/populate_metadata.sql' with the output of this #
31 # script. This will ensure that 'xref_parser.pl' populates the Xref #
32 # database with the correct data. #
33 # #
34 ########################################################################
35 
36 use strict;
37 use warnings;
38 
39 use Config::IniFiles;
# Configuration file to read: the first command-line argument when it
# names an existing regular file, otherwise 'xref_config.ini'.
my $file = 'xref_config.ini';
if ( defined $ARGV[0] && -f $ARGV[0] ) {
  $file = $ARGV[0];
}
warn "using ", $file;

# Optional second command-line argument; defaults to 0 when absent.
my $preparse = 0;
if ( defined $ARGV[1] ) {
  $preparse = $ARGV[1];
}

my $config = Config::IniFiles->new( -file => $file );
unless ( defined $config ) {
  # Relay every message Config::IniFiles collected before giving up.
  for my $error (@Config::IniFiles::errors) {
    warn "errors found";
    warn $error;
  }
  die "No Xref config made from $file. Check STDERR";
}

# Maps a source section name ('source <name>') to its numeric source id.
my %source_ids;
55 
# Do the species.
#
# For every '[species <name>]' section, emit one INSERT into 'species'
# per taxonomy id.  The first taxonomy id listed is used as the species
# id for all rows of that species.

print( '#' x 80, "\n" );
print("# SPECIES\n");
print("\n");

foreach my $section ( $config->GroupMembers('species') ) {
  # Drop the leading 'species ' (8 characters) from the section name.
  my $species_name = substr( $section, 8 );

  # A multi-valued setting is split on newlines, one taxonomy id per
  # line.
  my $taxonomy_value = $config->val( $section, 'taxonomy_id' );
  my @taxonomy_ids =
    defined($taxonomy_value) ? split( /\n/, $taxonomy_value ) : ();

  # Without a taxonomy id there is no species id; fail loudly instead
  # of reporting id 0 and emitting no rows for this species.
  if ( !@taxonomy_ids ) {
    die( sprintf( "No 'taxonomy_id' setting found\n"
                    . "while reading species section '[%s]'\n",
                  $section ) );
  }

  my $species_id = $taxonomy_ids[0];

  # The aliases default to the species name; the value is the same for
  # every taxonomy id, so look it up once.
  my $aliases = $config->val( $section, 'aliases' ) || $species_name;

  printf( "# Species '%s' (id = %d)\n", $species_name, $species_id );

  foreach my $taxonomy_id (@taxonomy_ids) {
    print(   "INSERT INTO species "
           . "(species_id, taxonomy_id, name, aliases)\n" );

    printf( "VALUES (%d, %d, '%s', '%s');\n",
            $species_id, $taxonomy_id, $species_name, $aliases );
  }

  print("\n");
}
83 
# Do the sources.
#
# Every '[source <name>]' section becomes one row in 'source', plus one
# row in 'dependent_source' for each entry of its comma-separated
# 'dependent_on' setting.  Source ids are handed out sequentially in
# sorted section-name order and recorded in %source_ids.

print( '#' x 80, "\n" );
print("# SOURCES\n");
print("\n");

my $source_id = 0;
foreach my $source_section ( sort( $config->GroupMembers('source') ) ) {
  my ( $spaces, $source_name ) =
    $source_section =~ /^source(\s+)(\S+)\s*$/;

  if ( length($spaces) > 1 ) {
    die( sprintf( "Too many spaces between the words 'source' and '%s'\n"
                    . "while reading source section '[%s]'\n",
                  $source_name, $source_section ) );
  }

  # Every setting is fetched into a scalar before it is printed.  In
  # list context Config::IniFiles' val() yields an empty list for a
  # missing setting, which would silently shift the remaining printf()
  # arguments into the wrong placeholders.
  my $name = $config->val( $source_section, 'name' );

  if ( !defined($name) ) {
    die( sprintf( "The source section '[%s]' has no 'name' setting\n",
                  $source_section ) );
  }

  # Config::IniFiles merges sections that share a name into a single
  # section with multi-valued settings, so a newline inside 'name'
  # means the section was defined more than once.
  if ( index( $name, "\n" ) != -1 ) {
    die( sprintf( "The source section '[%s]' occurs more\n"
                    . "than once in the configuration file\n",
                  $source_section ) );
  }

  $source_ids{$source_section} = ++$source_id;

  # Numeric settings: report a missing one explicitly and fall back to
  # 0, which is what the '%d' placeholder renders for an undefined
  # value.
  my %number_for;
  foreach my $key (qw(order priority)) {
    my $value = $config->val( $source_section, $key );
    if ( !defined($value) ) {
      warn( sprintf( "The source section '[%s]' has no '%s' setting, "
                       . "using 0\n",
                     $source_section, $key ) );
      $value = 0;
    }
    $number_for{$key} = $value;
  }

  my $priority_description =
    $config->val( $source_section, 'prio_descr', '' );
  my $status = $config->val( $source_section, 'status', 'NOIDEA' );

  printf( "# Source '%s' (id = %d)\n", $source_name, $source_id );

  print(   "INSERT INTO source "
         . "(name, source_release, ordered, "
         . "priority, priority_description, status)\n" );

  printf( "VALUES ('%s', '1', %d, %d, '%s', '%s');\n",
          $name, $number_for{'order'}, $number_for{'priority'},
          $priority_description, $status );

  print("\n");

  # 'dependent_on' is a comma-separated list; strip the whitespace
  # around each entry and ignore empty entries.
  my @dependents =
    grep { length($_) }
    map { my $entry = $_; $entry =~ s/^\s+|\s+$//g; $entry }
    split( /,/, $config->val( $source_section, 'dependent_on', '' ) );

  foreach my $dep (@dependents) {
    print "# adding source dependency that $source_section needs $dep loaded first\n";
    print "INSERT IGNORE INTO dependent_source (master_source_id, dependent_name)\n";
    printf( "VALUES (%d, '%s');\n\n", $source_ids{$source_section}, $dep );
  }

} ## end foreach my $source_section ...
144 
# Do the data files.
#
# For each species section (sorted by section name) and each source
# named in its 'source' setting, emit one INSERT into 'source_url' that
# ties the source id to the species id and names the parser to use.

print( '#' x 80, "\n" );
print("# DATA FILES\n");
print("\n");

my @species_sections = sort( $config->GroupMembers('species') );

for my $species_section (@species_sections) {
  my ( $gap, $species_name ) =
    ( $species_section =~ /^species(\s+)(\S+)\s*$/ );

  # Exactly one whitespace character may separate 'species' and the name.
  if ( length($gap) > 1 ) {
    my $message = sprintf(
               "Too many spaces between the words 'species' and '%s'\n"
                 . "while reading species section '[%s]'\n",
               $species_name, $species_section );
    die($message);
  }

  # The first taxonomy id listed serves as the species id.
  my ($species_id) =
    split( /\n/, $config->val( $species_section, 'taxonomy_id' ) );

  my $rule = '#' . ( '-' x 79 ) . "\n";
  print($rule);
  printf( "# Data for species '%s' (id = %d)\n",
          $species_name, $species_id );
  print($rule);
  print("\n");

  my @source_names =
    sort( split( /\n/, $config->val( $species_section, 'source' ) ) );

  for my $source_name (@source_names) {
    # Rebuild the section name and drop a single trailing whitespace
    # character, if there is one.
    my $source_section = "source $source_name";
    $source_section =~ s/\s$//;

    unless ( exists( $source_ids{$source_section} ) ) {
      my $message = sprintf( "Can not find source section '[%s]'\n"
                               . "while reading species section '[%s]'\n",
                             $source_section, $species_section );
      die($message);
    }

    my $id = $source_ids{$source_section};

    printf( "# Data from source '%s' (id = %d)\n", $source_name, $id );

    print(   "INSERT INTO source_url "
           . "(source_id, species_id, parser)\n" );

    # 'old_parser' wins when it is defined and $preparse is false;
    # otherwise the regular 'parser' setting is used.
    my $old_parser = $config->val( $source_section, 'old_parser' );
    my $parser;
    if ( defined($old_parser) && !$preparse ) {
      $parser = $old_parser;
    }
    else {
      $parser = $config->val( $source_section, 'parser' );
    }

    printf( "VALUES (%d, %d, '%s') ;\n", $id, $species_id, $parser );

    print("\n");

  } ## end for my $source_name (...)
} ## end for my $species_section (...)

print "# FINISHED SUCCESSFULLY\n";