v0.3.0 changes to latest rocker-base
sam-pullman committed Dec 13, 2023
1 parent 999df65 commit 2148d37
Showing 5 changed files with 186 additions and 231 deletions.
146 changes: 64 additions & 82 deletions Code/R/download.R
@@ -20,7 +20,7 @@ options(timeout=100)
optionList = list(
optparse::make_option(c("--container-build"), type="logical", default=FALSE,
help="is this script running inside of a container build process", metavar="logical"),

optparse::make_option(c("--include-exclusions"), type="logical", default=FALSE,
help="whether or not to exclude the tables in Code/R/excluded_tables.tsv", metavar="logical")
);
@@ -40,6 +40,7 @@ sqlUserName = "sa"
sqlPassword = "yourStrong(!)Password"
sqlDefaultDb = "master"


# loop waiting for SQL Server database to become available
for (i in 1:60) {
cn = tryCatch(
@@ -77,60 +78,36 @@ persistTextFiles = FALSE

outputDirectory = "/NHANES/Data"

# Try using the comprehensive listing
comprehensiveHtmlDataList = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx"
htmlFileList = readLines(comprehensiveHtmlDataList)
htmlTableStartLine = grep(x = htmlFileList, pattern = "<table")
htmlTableEndLine = grep(x = htmlFileList, pattern = "</table")

if (length(htmlTableStartLine) != 1 | length(htmlTableEndLine) != 1 ) {
stop(
paste(
"The original HTML file listing at",
comprehensiveHtmlDataList,
"contained only one table. You will need to do some investigation and debugging."
)
)
}

# Convert the HTML table to a data frame so we can iterate on the rows
htmlObj = xml2::read_html(paste(collapse="\n", htmlFileList[htmlTableStartLine : htmlTableEndLine]))

fileListTable = dplyr::`%>%`(htmlObj, rvest::html_table())[[1]]

# Create a new column 'Data File Name' populated with each file name from the 'Doc File' column without ' Doc', e.g. 'ACQ_D Doc' -> 'ACQ_D'
fileListTable$'Data File Name' <- gsub(" Doc","",as.character(fileListTable$'Doc File'))

# Some years in the Years column do not match the year in the URL, e.g. the 'SSHCV' Years
# value is '2007-2012' but the URL is https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/SSHCV_E.XPT.
# The following lines correct the affected rows in the table.
fileListTable$Years[fileListTable$Years == "1988-2020"] <- '1999-2000'
fileListTable$Years[fileListTable$Years == "2007-2012"] <- '2007-2008'
fileListTable$Years[fileListTable$Years == "1999-2004"] <- '1999-2000'
fileListTable$Years[fileListTable$Years == "1999-2020"] <- '1999-2000'
fileListTable$Years[fileListTable$Years == "2017-2020"] <- '2017-2018'

# Replace the hyperlink in the Doc File column with the full url
fileListTable$'Doc File' <- glue::glue("https://wwwn.cdc.gov/Nchs/Nhanes/{fileListTable$Years}/{fileListTable$'Doc File'}.htm")
fileListTable$'Doc File'<-gsub(" Doc","",as.character(fileListTable$'Doc File'))

# Replace the hyperlink in the Data File column with the full url
fileListTable$'Data File' <- glue::glue("https://wwwn.cdc.gov/Nchs/Nhanes/{fileListTable$Years}/{fileListTable$'Data File'}.XPT")
fileListTable$'Data File'<- gsub('([A-z]+) .*', '\\1', as.character(fileListTable$'Data File'))
fileListTable$'Data File' <- paste0(fileListTable$'Data File', ".XPT")
fileListTable$'Data File'<-gsub(" Data","",as.character(fileListTable$'Data File'))
fileListTable = nhanesA::nhanesManifest("public")
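# For orientation (a sketch, not part of the commit): nhanesManifest("public")
# returns one row per public NHANES table. The columns this script relies on
# below are Table, DataURL, and Years, e.g.
#   manifest = nhanesA::nhanesManifest("public")
#   head(manifest[, c("Table", "DataURL", "Years")])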

excludedTables = read.csv("/NHANES/excluded_tables.tsv", sep='\t')
ex <- nhanesA::nhanesManifest("limited")['Table']
reasons <- rep("Limited Access",length(ex))
ex <- cbind(ex, reasons)
colnames(ex)[colnames(ex) == "Table"] ="TableName"
colnames(ex)[colnames(ex) == "reasons"] ="Reason"
excludedTables <- rbind(excludedTables, ex)
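# Quick shape check (illustration only): after the rbind, excludedTables has
# columns TableName and Reason, combining the curated rows read from
# excluded_tables.tsv with every table in nhanesA::nhanesManifest("limited"),
# the latter all tagged "Limited Access":
#   table(excludedTables$Reason)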

# write the table to file
write.table(
excludedTables,
file = "/NHANES/excluded_tables.tsv",
sep = "\t",
na = "",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)

if (!opt[["include-exclusions"]]) {
fileListTable <- fileListTable[!grepl(paste(excludedTables$TableName, collapse = "|"), fileListTable$'Data File Name'),]
fileListTable <- fileListTable[!grepl(paste(excludedTables$TableName, collapse = "|"), fileListTable$'Table'),]
} else{
fileListTable <- fileListTable[grepl(paste(excludedTables$TableName, collapse = "|"), fileListTable$'Data File Name'),]
fileListTable <- fileListTable[grepl(paste(excludedTables$TableName, collapse = "|"), fileListTable$'Table'),]
}

# enumerate distinct data types
fileListTable$"Data File Name" <- strtrim(fileListTable$"Data File Name", 128)
dataTypes = unique(fileListTable$"Data File Name")
fileListTable$"Table" <- strtrim(fileListTable$"Table", 128)
dataTypes = unique(fileListTable$"Table")

# fix case-differing strings
dataTypes = sort(dataTypes)
@@ -142,21 +119,6 @@ names(uniqueUpper) = dataTypes[representativeStringIndex]
dataTypes = names(uniqueUpper)
names(dataTypes) = uniqueUpper
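# The hunk above is truncated; a minimal reconstruction of the elided
# case-folding step (an assumption, not the committed code) would be:
#   upperTypes = toupper(dataTypes)
#   uniqueUpper = unique(upperTypes)
#   representativeStringIndex = match(uniqueUpper, upperTypes)
# leaving dataTypes holding one representative spelling per table name,
# keyed via names() by its uppercase form, so toupper() lookups work below.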

cnames = colnames(fileListTable)
fileListTable = cbind(fileListTable, dataTypes[toupper(fileListTable$"Data File Name")])
colnames(fileListTable) = c(cnames, "ScrubbedDataType")

#--------------------------------------------------------------------------------------------------------
# performance notes for large XPTs:
# 14.6G for a single PAXMIN
# 10G after gc()
# balloons to 32G after second read
# 20G after gc()
# 40G during bind_rows
# 20G after rm XPTs and gc()
# 12G file
#--------------------------------------------------------------------------------------------------------


if (!opt[["include-exclusions"]]){
# create landing zone for the raw data, set recovery mode to simple
@@ -171,6 +133,27 @@ if (!opt[["include-exclusions"]]){

SqlTools::dbSendUpdate(cn, "USE NhanesLandingZone")

# create the ExcludedTables table in SQL
SqlTools::dbSendUpdate(cn, "
CREATE TABLE NhanesLandingZone.Metadata.ExcludedTables (
TableName varchar(64),
Reason varchar(64)
)
")

# run bulk insert
insertStatement = paste(sep="", "
BULK INSERT NhanesLandingZone.Metadata.ExcludedTables FROM '/NHANES/excluded_tables.tsv'
WITH (KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=2, FIELDTERMINATOR = '\t', ROWTERMINATOR = '\n')")

SqlTools::dbSendUpdate(cn, insertStatement)

# shrink transaction log
SqlTools::dbSendUpdate(cn, "DBCC SHRINKFILE(NhanesLandingZone_log)")

# issue checkpoint
SqlTools::dbSendUpdate(cn, "CHECKPOINT")

# prevent scientific notation
options(scipen = 15)

@@ -180,6 +163,18 @@ questionnaireVariables = dplyr::tibble(
TableName=character()
)

#--------------------------------------------------------------------------------------------------------
# performance notes for large XPTs:
# 14.6G for a single PAXMIN
# 10G after gc()
# balloons to 32G after second read
# 20G after gc()
# 40G during bind_rows
# 20G after rm XPTs and gc()
# 12G file
#--------------------------------------------------------------------------------------------------------


# enable restart
i=1
downloadErrors = dplyr::tibble(
@@ -189,12 +184,11 @@ downloadErrors = dplyr::tibble(
)

for (i in i:length(dataTypes)) {
# for (i in 1:500) {
# get the name of the data type
currDataType = dataTypes[i]
currDataType = toupper(dataTypes[i])

# find all rows with URLs that should be relevant to the current data type
rowsForCurrDataType = which(fileListTable[,"ScrubbedDataType"] == currDataType)
rowsForCurrDataType = which(fileListTable[,"Table"] == currDataType)

# assemble a list containing all of the subtables for this data type
dfList = list()
@@ -203,28 +197,17 @@ for (i in i:length(dataTypes)) {
for (currRow in rowsForCurrDataType) {

# get the URL for the SAS file pointed to by the current row
currFileUrl = fileListTable[currRow, "Data File"]
currFileUrl = fileListTable[currRow, "DataURL"]

# get the date range for this table
currYears = fileListTable[currRow, "Years"]

# split the URL on '/' to extract the file name
urlSplit = strsplit(x = currFileUrl, split = "/", fixed = TRUE)[[1]]
fileName = urlSplit[length(urlSplit)]

# TODO: move these to the exclusions group above
cat("reading ", currFileUrl, "\n")

# attempt to download each file and log errors
result = tryCatch({
currTemp = tempfile()
utils::download.file(
url = currFileUrl,
destfile = currTemp
)
z = haven::read_xpt(currTemp)
file.remove(currTemp)
z
nhanesA::nhanesFromURL(currFileUrl, translated = FALSE)
}, warning = function(w) {
downloadErrors <<- dplyr::bind_rows(
downloadErrors,
@@ -257,12 +240,11 @@ for (i in i:length(dataTypes)) {
result = dplyr::bind_cols(result, years)
}
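# Standalone sketch of the new download path (the URL is an example, not
# taken from the manifest at build time); translated = FALSE keeps the raw
# coded values so that translation can happen later in SQL:
#   xpt = nhanesA::nhanesFromURL(
#     "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT",
#     translated = FALSE
#   )
# This single call replaces the old utils::download.file() +
# haven::read_xpt() + tempfile() bookkeeping.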


questionnaireVariables =
dplyr::bind_rows(
questionnaireVariables,
dplyr::bind_cols(
"Variable" = colnames(result),
"Variable" = toupper(colnames(result)),
"TableName" = rep(currDataType, times = ncol(result))
)
)
@@ -315,7 +297,7 @@ for (i in i:length(dataTypes)) {
for (currCharColumn in ixCharacterColumns) {

# fix any embedded line endings
m[,currCharColumn] = gsub(pattern = "[\r\n]", replacement = "", x = dplyr::pull(m[,currCharColumn]), useBytes = TRUE)
m[,currCharColumn] = gsub(m[,currCharColumn], pattern = "\r\n", replacement = "", useBytes = TRUE, fixed = TRUE)
}
}
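# Behavior note on the fixed = TRUE rewrite (illustration, not part of the
# commit): the literal "\r\n" pattern removes only CRLF pairs, whereas the
# old "[\r\n]" character class also removed lone \r or \n:
#   gsub("\r\n", "", "a\r\nb", fixed = TRUE)   # "ab"
#   gsub("\r\n", "", "a\nb",   fixed = TRUE)   # "a\nb" -- lone \n survives
# If bare newlines can occur in the data, the character-class form may still
# be safer ahead of a '\n'-terminated bulk load.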

@@ -358,7 +340,7 @@ for (i in i:length(dataTypes)) {
currDataType,
" FROM '",
currOutputFileName,
"' WITH (KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=1, FIELDTERMINATOR='\t')"
"' WITH (KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=1, FIELDTERMINATOR='\t', ROWTERMINATOR = '\n')"
)

SqlTools::dbSendUpdate(cn, insertStatement)
38 changes: 7 additions & 31 deletions Code/R/etlVariableCodebook.R
@@ -14,8 +14,6 @@ tablesFile = paste(sep = "/", getwd(), "metadata/nhanes_tables.tsv")
variablesFile = paste(sep = "/", getwd(), "metadata/nhanes_variables.tsv")
ontologyMappings = paste(sep = "/", getwd(), "ontology-mappings/")
ontologyTables = paste(sep = "/", getwd(), "ontology-tables/")
excludedTables = "/NHANES/excluded_tables.tsv"


# parameters to connect to SQL
sqlHost = "localhost"
@@ -61,7 +59,7 @@ suppressWarnings({
SqlTools::dbSendUpdate(cn, "
CREATE TABLE NhanesLandingZone.Metadata.VariableCodebook (
Variable varchar(64),
TableName varchar(64),
[Table] varchar(64),
CodeOrValue varchar(64),
ValueDescription varchar(256),
Count int,
@@ -73,34 +71,11 @@ SqlTools::dbSendUpdate(cn, "
# run bulk insert
insertStatement = paste(sep="", "
BULK INSERT NhanesLandingZone.Metadata.VariableCodebook FROM '", codebookFile, "'
WITH (FORMAT='CSV', KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=2, FIELDTERMINATOR='\t', ROWTERMINATOR = '\n')
WITH (FORMAT='CSV', KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=2, FIELDTERMINATOR='\t')
")

SqlTools::dbSendUpdate(cn, insertStatement)

# TODO: make this table more comprehensive, to include suffixes as well as prefixes
# create the ExcludedTables table in SQL
SqlTools::dbSendUpdate(cn, "
CREATE TABLE NhanesLandingZone.Metadata.ExcludedTables (
TableName varchar(64),
Reason varchar(64)
)
")

# run bulk insert
insertStatement = paste(sep="", "
BULK INSERT NhanesLandingZone.Metadata.ExcludedTables FROM '", excludedTables, "'
WITH (KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=2, FIELDTERMINATOR = '\t', ROWTERMINATOR = '\r\n')")


SqlTools::dbSendUpdate(cn, insertStatement)

# shrink transaction log
SqlTools::dbSendUpdate(cn, "DBCC SHRINKFILE(NhanesLandingZone_log)")

# issue checkpoint
SqlTools::dbSendUpdate(cn, "CHECKPOINT")

# load the table descriptions

# create the nhanes_tables table in SQL
@@ -121,7 +96,7 @@ SqlTools::dbSendUpdate(cn, "
# run bulk insert
insertStatement = paste(sep="", "
BULK INSERT ##tmp_nhanes_tables FROM '", tablesFile, "'
WITH (KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=2, FIELDTERMINATOR='\t')
WITH (FORMAT='CSV', KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=2, FIELDTERMINATOR='\t')
")

SqlTools::dbSendUpdate(cn, insertStatement)
@@ -182,7 +157,7 @@ SqlTools::dbSendUpdate(cn, "
[Table] varchar(64),
SASLabel varchar(64),
EnglishText varchar(1024),
Target varchar(128),
Target varchar(max),
UseConstraints varchar(128),
ProcessedText varchar(1024),
Tags varchar(1024),
@@ -200,14 +175,15 @@ insertStatement = paste(sep="", "

SqlTools::dbSendUpdate(cn, insertStatement)


# add columns to QuestionnaireVariables table to accommodate additional data
SqlTools::dbSendUpdate(cn, "
ALTER TABLE NhanesLandingZone.Metadata.QuestionnaireVariables
ADD
Description varchar(1024) NULL,
Target varchar(128) NULL,
Target varchar(max) NULL,
SasLabel varchar(64),
UseConstraints varchar(64),
UseConstraints varchar(128),
ProcessedText varchar(1024),
Tags varchar(1024),
VariableID varchar(1024),
6 changes: 3 additions & 3 deletions Code/Sql/spTranslateTable.sql
@@ -12,7 +12,7 @@ AS

-- check that the variable codebook actually has data for this table
DECLARE @variableTranslationCount INT
SELECT @variableTranslationCount = COUNT(*) FROM Metadata.VariableCodebook C WHERE C.TableName = @SourceTableName
SELECT @variableTranslationCount = COUNT(*) FROM Metadata.VariableCodebook C WHERE C.[Table] = @SourceTableName

-- if there are no translatable variables for this table, just copy it over
IF @variableTranslationCount = 0
@@ -79,7 +79,7 @@ AS
FROM
#tmpColNames C
INNER JOIN Metadata.VariableCodebook V ON
V.TableName = @SourceTableName
V.[Table] = @SourceTableName
AND C.COLUMN_NAME = V.Variable
AND V.ValueDescription = 'Range of Values'
WHERE C.COLUMN_NAME != @pkColName
@@ -182,7 +182,7 @@ AS
LEFT OUTER JOIN Metadata.VariableCodebook V ON
T.Variable = V.Variable
AND CAST(T.Response AS VARCHAR(256)) = CAST(V.CodeOrValue AS VARCHAR(256))
AND V.TableName = ''' @SourceTableName '''
AND V.[Table] = ''' @SourceTableName '''
'

-- debugging