ensembl-hive  2.8.1
import_recombinant_hotspots.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 # Designed to work on data retrieved from
19 # https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20110106_recombination_hotspots/
20 #
21 # Imports the recombination hotspots from 1000 Genomes into Ensembl
22 
23 use strict;
24 use warnings;
25 
31 
32 use Getopt::Long;
33 use File::Fetch;
34 use IO::Dir;
35 use IO::File;
36 
37 my ($url,$db_name,$db_host,$db_user,$db_pass,$db_port,$db_version,$help,$species);
38 $species = "human";
39 
40 GetOptions ("url=s" => \$url,
41  "db_name=s" => \$db_name,
42  "db_host=s" => \$db_host,
43  "db_user=s" => \$db_user,
44  "db_pass=s" => \$db_pass,
45  "db_port=s" => \$db_port,
46  "db_version=s" => \$db_version,
47  "species=s" => \$species,
48  "h!" => \$help,
49  "help!" => \$help,
50 );
51 
52 if ($help) {&usage; exit;}
53 unless ($url and $db_name and $db_host) {print "Insufficient arguments\n"; &usage; exit;}
54 
55 
56 my $dba = new Bio::EnsEMBL::DBSQL::DBAdaptor(
57  -species => $species,
58  -group => 'core',
59  -dbname => $db_name,
60  -host => $db_host,
61  -user => $db_user,
62  -pass => $db_pass,
63  -port => $db_port,
64  -db_version => $db_version,
65 );
66 
67 Bio::EnsEMBL::Registry->add_DBAdaptor($species,'core',$dba);
68 
69 #Bio::EnsEMBL::Registry->load_registry_from_db(
70 # -host => $db_host,
71 # -user => $db_user,
72 # -pass => $db_pass,
73 # -port => $db_port,
74 # -db_version => $db_version,
75 #);
76 
77 my $file_fetch = File::Fetch->new(uri=>$url);
78 my $archive_path = $file_fetch->fetch() or die "Unable to get data from given URL. ".$file_fetch->error;
79 
80 system('tar','-xzf',$archive_path);
81 
82 my %directory;
83 tie %directory, 'IO::Dir',".";
84 foreach my $file (keys %directory) {
85  if ($file =~ /\.txt$/) {
86  my $fh = IO::File->new($file);
87  &process_file($fh);
88  $fh->close;
89  }
90 }
91 print "Finished. Feel free to delete downloaded data in this directory.\n";
92 
93 sub process_file {
94  my $file_handle = shift;
95  <$file_handle>; #strip header
96 
97  my $simple_feature_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species,'core', 'SimpleFeature');
98  my $slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($species,'core', 'Slice');
99 
100  my $analysis = new Bio::EnsEMBL::Analysis (
101  -description => "The map was generated using the HapMap Phase II data and human genome assembly NCBI35 using LDhat as described in the 2007 HapMap paper (Nature, 18th Sept 2007).
102 
103 The map was then converted from NCBI35 to GRCh37 coordinates and inspected for regions in which
104 the genome assembly had be rearranged.
105 
106 See https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20110106_recombination_hotspots/",
107  -display_label => 'HapMap Phase II genetic recombination map',
108  -displayable => 1,
109  -logic_name => 'human_1kg_hapmap_phase_2',
110  );
111 
112  my $previous_chromosome;
113  my $slice;
114  my @features;
115  while (my $line = <$file_handle>) {
116  my ($chromosome,$position,$score) = split(/\t+/,$line);
117  $chromosome =~ s/^chr//; # chr prefix is for UCSC
118  $chromosome =~ s/\_.+$//; # for removing PAR info
119  if (!$slice || $previous_chromosome ne $chromosome) {
120  $slice = $slice_adaptor->fetch_by_region('toplevel', $chromosome);
121  }
122  my $simple_feature = Bio::EnsEMBL::SimpleFeature->new(
123  -start => $position,
124  -end => $position,
125  -score => $score,
126  -analysis => $analysis,
127  -slice => $slice,
128  -strand => 1,
129  -display_label => 'Recombination hotspot',
130  );
131 
132  push @features, $simple_feature;
133  $previous_chromosome = $chromosome;
134  }
135  $simple_feature_adaptor->store(@features);
136 }
137 
138 sub usage {
139  print "Launching instructions:
140  Run from a folder you are happy to have filled with files.
141 
142 Options:
143 
144  -url Supply the URL to download from
145  -db_name The DB to add these features to
146  -db_host Hostname for the DB
147  -db_user
148  -db_pass
149  -db_port
150  -db_version
151  -species
152 
153  -help
154 ";
155 }
Bio::EnsEMBL::Registry::get_adaptor
public Adaptor get_adaptor()
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor
Definition: SimpleFeatureAdaptor.pm:31
Bio::EnsEMBL::Analysis
Definition: PairAlign.pm:3
process_file
public process_file()
Bio::EnsEMBL::SimpleFeature
Definition: SimpleFeature.pm:31
Bio::EnsEMBL::Registry::add_DBAdaptor
public void add_DBAdaptor()
usage
public usage()
Bio::EnsEMBL::SimpleFeature::new
public Bio::EnsEMBL::Feature new()