-
Notifications
You must be signed in to change notification settings - Fork 5
/
deSPI-download
executable file
·227 lines (184 loc) · 6.92 KB
/
deSPI-download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/bin/bash
#
#Original Author: Florian Breitwieser
#Original Script: centrifuge-download
#Site: https://github.com/infphilo/centrifuge/blob/master/centrifuge-download
#Revised by Dengfeng Guan
#Email: [email protected]
#Please feel free to contact me if there are problems in the script
#set -eu -o pipefail
set -x
exists() {
command -v "$1" >/dev/null 2>&1
}
if hash wget 2>/dev/null; then
DL_PROG="wget -N --reject=index.html -qO"
else
DL_PROG="curl -s -o"
fi
export DL_PROG
#########################################################
## Functions
function download_n_process() {
IFS=$'\t' read -r TAXID FILEPATH <<< "$1"
NAME=`basename $FILEPATH .gz`
[[ -f "$LIBDIR/$DOMAIN/$NAME.gz" ]] || $DL_PROG "$LIBDIR/$DOMAIN/$NAME.gz" "$FILEPATH" || { printf "\nError downloading $FILEPATH!\n" >&2 && exit 1; }
gunzip -f "$LIBDIR/$DOMAIN/$NAME.gz" ||{ printf "\nError gunzipping $LIBDIR/$DOMAIN/$NAME.gz [ downloaded from $FILEPATH ]!\n" >&2 && exit 255; }
#sed -i "s/^>/>tid|$TAXID|ref|/" $LIBDIR/$DOMAIN/$NAME
echo "$LIBDIR/$DOMAIN/$NAME" $TAXID >>$LIBDIR/ref2tid.map
echo done
}
export -f download_n_process
ceol=`tput el` # terminfo clr_eol
function count {
typeset C=0
while read L; do
if [[ "$L" == "done" ]]; then
C=$(( C 1 ))
_progress=$(( (${C}*100/${1}*100)/100 ))
_done=$(( (${_progress}*4)/10 ))
_left=$(( 40-$_done ))
# Build progressbar string lengths
_done=$(printf "%${_done}s")
_left=$(printf "%${_left}s")
printf "\rProgress : [${_done// /#}${_left// /-}] ${_progress}%% $C/$1" 1>&2
else
echo "$L"
fi
done
}
function check_or_mkdir_no_fail {
#echo -n "Creating $1 ... " >&2
if [[ -d $1 && ! -n `find $1 -prune -empty -type d` ]]; then
echo "Directory exists already! Continuing" >&2
return `true`
else
#echo "Done" >&2
mkdir -p $1
return `true`
fi
}
## Check if GNU parallel exists
command -v parallel >/dev/null 2>&1 && PARALLEL=1 || PARALLEL=0
ALL_GENOMES="bacteria, viral, archaea"
ALL_DATABASES="refseq, genbank, taxonomy"
ALL_ASSEMBLY_LEVELS="Complete\ Genome, Chromosome, Scaffold, Contig"
ALL_REFSEQ_CATEGORY="representitive\ genome, reference\ genome, na"
## Option parsing
DATABASE="refseq"
ASSEMBLY_LEVEL="Complete Genome"
REFSEQ_CATEGORY=""
TAXID=""
DOWNLOAD_GI_MAP=0
BASE_DIR="."
N_PROC=1
CHANGE_HEADER=0
DOWNLOAD_RNA=0
DO_DUST=0
FILTER_UNPLACED=0
USAGE="
`basename $0` [<options>] <database>
ARGUMENT
<database> One of refseq, genbank, or taxonomy:
- use refseq or genbank for genomic sequences,
- taxonomy for taxonomy mappings.
COMMON OPTIONS
-o <directory> Folder to which the files are downloaded. Default: '$BASE_DIR'.
-P <# of threads> Number of processes when downloading (uses xargs). Default: '$N_PROC'
WHEN USING database refseq OR genbank:
-d <domain> What domain to download. One or more of ${ALL_GENOMES} (comma separated, required).
-a <assembly level> Only download genomes with the specified assembly level ($ALL_ASSEMBLY_LEVELS). Default: '$ASSEMBLY_LEVEL'.
-c <refseq category> only download genomes the specified refseq category ($ALL_REFSEQ_CATEGORY). Default: any.
"
# arguments: $OPTFIND (current index), $OPTARG (argument for option), $OPTERR (bash-specific)
while getopts "o:P:d:a:c:g" OPT "$@"; do
case $OPT in
o) BASE_DIR="$OPTARG" ;;
P) N_PROC="$OPTARG" ;;
d) DOMAINS=${OPTARG//,/ } ;;
a) ASSEMBLY_LEVEL="$OPTARG" ;;
c) REFSEQ_CATEGORY="$OPTARG" ;;
g) DOWNLOAD_GI_MAP=1 ;;
\?) echo "Invalid option: -$OPTARG" >&2
exit 1
;;
:) echo "Option -$OPTARG requires an argument." >&2
exit 1
;;
esac
done
shift $((OPTIND-1))
[[ $# -eq 1 ]] || { printf "$USAGE" >&2 && exit 1; };
DATABASE=$1
#echo $DATABASE
#### TAXONOMY DOWNLOAD
FTP="ftp://ftp.ncbi.nih.gov"
if [[ "$DATABASE" == "taxonomy" ]]; then
echo "Downloading NCBI taxonomy ... " >&2
if check_or_mkdir_no_fail "$BASE_DIR"; then
cd "$BASE_DIR" > /dev/null
if [[ "$DOWNLOAD_GI_MAP" == "1" ]]; then
$DL_PROG gi_taxid_nucl.dmp.gz $FTP/pub/taxonomy/gi_taxid_nucl.dmp.gz
gunzip -c gi_taxid_nucl.dmp.gz | sed 's/^/gi|/' > gi_taxid_nucl.map
else
$DL_PROG taxdump.tar.gz $FTP/pub/taxonomy/taxdump.tar.gz
tar -zxvf taxdump.tar.gz nodes.dmp
tar -zxvf taxdump.tar.gz names.dmp
rm taxdump.tar.gz
fi
cd - > /dev/null
fi
exit 0
fi
#### REFSEQ/GENBANK DOWNLOAD
export LIBDIR=$(readlink -f $BASE_DIR)
#export CHANGE_HEADER="$CHANGE_HEADER"
## Fields in the assembly_summary.txt file
REFSEQ_CAT_FIELD=5
TAXID_FIELD=6
SPECIES_TAXID_FIELD=7
VERSION_STATUS_FIELD=11
ASSEMBLY_LEVEL_FIELD=12
FTP_PATH_FIELD=20
AWK_QUERY="\$$ASSEMBLY_LEVEL_FIELD==\"$ASSEMBLY_LEVEL\" && \$$VERSION_STATUS_FIELD==\"latest\""
[[ "$REFSEQ_CATEGORY" != "" ]] && AWK_QUERY="$AWK_QUERY && \$$REFSEQ_CAT_FIELD==\"$REFSEQ_CATEGORY\""
TAXID=${TAXID//,/|}
[[ "$TAXID" != "" ]] && AWK_QUERY="$AWK_QUERY && match(\$$TAXID_FIELD,\"^($TAXID)\$\")"
#echo "$AWK_QUERY" >&2
#echo "Downloading genomes for $DOMAINS at assembly level $ASSEMBLY_LEVEL" >&2
if exists wget; then
wget -qO- --no-remove-listing ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/ > /dev/null
else
curl -s ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/ > .listing
fi
#if [[ "$CHANGE_HEADER" == "1" ]]; then
#echo "Modifying header to include taxonomy ID" >&2
#fi
echo $DOMAINS
for DOMAIN in $DOMAINS; do
if [[ ! `grep " $DOMAIN" .listing` ]]; then
echo "$DOMAIN is not a valid domain - use one of the following:" >&2
grep '^d' .listing | sed 's/.* //'
exit 1
fi
#if [[ "$CHANGE_HEADER" != "1" ]]; then
#echo "Writing taxonomy ID to sequence ID map to STDOUT" >&2
#[[ -f "$LIBDIR/$DOMAIN.map" ]] && rm "$LIBDIR/$DOMAIN.map"
#fi
export DOMAIN=$DOMAIN
echo $LIBDIR/$DOMAIN
check_or_mkdir_no_fail $LIBDIR/$DOMAIN
echo $LIBDIR
FULL_ASSEMBLY_SUMMARY_FILE="$LIBDIR/$DOMAIN/assembly_summary.txt"
ASSEMBLY_SUMMARY_FILE="$LIBDIR/$DOMAIN/assembly_summary_filtered.txt"
#echo "Downloading and filtering the assembly_summary.txt file ..." >&2
$DL_PROG "$FULL_ASSEMBLY_SUMMARY_FILE" ftp://ftp.ncbi.nlm.nih.gov/genomes/$DATABASE/$DOMAIN/assembly_summary.txt > "$FULL_ASSEMBLY_SUMMARY_FILE"
#echo $AWK_QUERY
awk -F "\t" "BEGIN {OFS=\"\t\"} $AWK_QUERY" "$FULL_ASSEMBLY_SUMMARY_FILE" > "$ASSEMBLY_SUMMARY_FILE"
N_EXPECTED=`cat "$ASSEMBLY_SUMMARY_FILE" | wc -l`
[[ $N_EXPECTED -gt 0 ]] || { echo "Domain $DOMAIN has no genomes with specified filter." >&2; exit 1; }
echo "Downloading $N_EXPECTED $DOMAIN genomes at assembly level $ASSEMBLY_LEVEL ... (will take a while)" >&2
cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" | sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process "$@"' _ | count $N_EXPECTED
echo >&2
done