Skip to content

Commit

Permalink
Merge pull request #143 from ccb-hms/feature/rocker-base
Browse files Browse the repository at this point in the history
Feature/rocker base
  • Loading branch information
sam-pullman authored Jan 24, 2024
2 parents 5189165 952b274 commit e22a3e2
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 23 deletions.
26 changes: 20 additions & 6 deletions Code/R/etlVariableCodebook.R
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 296,7 @@ ontology_mappings <- list.files(ontologyMappings)

for (currTable in ontology_mappings) {

# TODO: we really only want to load this single table? then why loop?
if (currTable == "nhanes_variables_mappings.tsv") {
if (currTable != "README.md" & currTable != "non-mappings") {
path = ontologyMappings
loaded_data <- read.csv(file = paste0(path, currTable), sep = "\t")

Expand All @@ -306,26 305,41 @@ for (currTable in ontology_mappings) {
# generate SQL table definitions from column types in tibbles
createTableQuery = DBI::sqlCreateTable(DBI::ANSI(), paste("Ontology", str_extract(currTable, '.*(?=\\.tsv)'), sep="."), loaded_data) # nolint

# change TEXT to VARCHAR(256)
# change TEXT to VARCHAR(512)
createTableQuery = gsub(createTableQuery, pattern = "\" TEXT", replace = "\" VARCHAR(512)", fixed = TRUE) # nolint # nolint

# change DOUBLE to VARCHAR(256)
# change DOUBLE to VARCHAR(512)
createTableQuery = gsub(createTableQuery, pattern = "\" DOUBLE", replace = "\" VARCHAR(512)", fixed = TRUE)

# change SMALLINT to VARCHAR(512)
createTableQuery = gsub(createTableQuery, pattern = "SMALLINT", replace = "VARCHAR(512)", fixed = TRUE)

# remove double quotes, which interferes with the schema specification
createTableQuery = gsub(createTableQuery, pattern = '"', replace = "", fixed = TRUE)

# create the table in SQL
SqlTools::dbSendUpdate(cn, createTableQuery)

# run bulk insert
insertStatement = paste(sep="",
if (currTable == "nhanes_oral_health_mappings.tsv") {
insertStatement = paste(sep="",
"BULK INSERT Ontology.",
str_extract(currTable, '.*(?=\\.tsv)'),
" FROM '",
paste0(path, currTable),
"' WITH (KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=2, FIELDTERMINATOR = '\t', ROWTERMINATOR = '\n')"
"' WITH (KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=2, FIELDTERMINATOR = '\t', ROWTERMINATOR = '\r\n')"
)
}
else{
insertStatement = paste(sep="",
"BULK INSERT Ontology.",
str_extract(currTable, '.*(?=\\.tsv)'),
" FROM '",
paste0(path, currTable),
"' WITH (KEEPNULLS, TABLOCK, ROWS_PER_BATCH=2000, FIRSTROW=2, FIELDTERMINATOR = '\t', ROWTERMINATOR = '\n')"
)
}

SqlTools::dbSendUpdate(cn, insertStatement)

# keep memory as clean as possible
Expand Down
33 changes: 19 additions & 14 deletions Container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -152,18 152,18 @@ RUN mkdir -p -m 770 /var/opt/mssql && chgrp -R 0 /var/opt/mssql
# Install and configure database connectivity components
#------------------------------------------------------------------------------

# install FreeTDS driver
WORKDIR /tmp
RUN wget ftp://ftp.freetds.org/pub/freetds/stable/freetds-1.1.40.tar.gz
RUN tar zxvf freetds-1.1.40.tar.gz
RUN cd freetds-1.1.40 && ./configure --enable-krb5 && make && make install
RUN rm -r /tmp/freetds*

# tell unixodbc where to find the FreeTDS driver shared object
RUN echo '\n\
[FreeTDS]\n\
Driver = /usr/local/lib/libtdsodbc.so \n\
' >> /etc/odbcinst.ini
# # install FreeTDS driver
# WORKDIR /tmp
# RUN wget ftp://ftp.freetds.org/pub/freetds/stable/freetds-1.1.40.tar.gz
# RUN tar zxvf freetds-1.1.40.tar.gz
# RUN cd freetds-1.1.40 && ./configure --enable-krb5 && make && make install
# RUN rm -r /tmp/freetds*

# # tell unixodbc where to find the FreeTDS driver shared object
# RUN echo '\n\
# [FreeTDS]\n\
# Driver = /usr/local/lib/libtdsodbc.so \n\
# ' >> /etc/odbcinst.ini

# install pyodbc
RUN pip3 install pyodbc
Expand Down Expand Up @@ -240,7 240,7 @@ RUN Rscript -e "remotes::install_cran('optparse')"

#install nhanesA and phonto from github in order to have the latest version (CRAN will take a while to update)
RUN Rscript -e "devtools::install_github('ccb-hms/phonto')"
RUN Rscript -e "devtools::install_github('cjendres1/nhanes')"
RUN Rscript -e "devtools::install_github('cjendres1/nhanes@97f1850')"

#------------------------------------------------------------------------------
# Download and DB Build Process
Expand Down Expand Up @@ -299,7 299,7 @@ RUN git clone https://github.com/ccb-hms/NHANES-metadata.git
WORKDIR /tmp/NHANES-metadata

RUN git fetch --tags
RUN git checkout tags/v3.8.2
RUN git checkout tags/v3.9.1

# install packages, which require newer version of rvest in conflict with the download / build script
RUN Rscript -e "remotes::install_version('DT', repos='https://packagemanager.posit.co/cran/__linux__/focal/2022-11-04', upgrade=TRUE, version='0.26')"
Expand Down Expand Up @@ -353,9 353,14 @@ RUN chmod -R 777 /usr/local/lib/R/library
RUN chmod -R 777 /usr/local/lib/R/doc/html/packages.html

# Declare which version of the container this is, and make it available inside the container
<<<<<<< HEAD
ENV EPICONDUCTOR_CONTAINER_VERSION v0.4.0
ENV EPICONDUCTOR_COLLECTION_DATE 2024-01-24
=======

ENV EPICONDUCTOR_CONTAINER_VERSION v0.3.0
ENV EPICONDUCTOR_COLLECTION_DATE 2023-12-19
>>>>>>> origin/feature/rocker-base

RUN echo "EPICONDUCTOR_CONTAINER_VERSION=$EPICONDUCTOR_CONTAINER_VERSION" >> /usr/local/lib/R/etc/Renviron.site
RUN echo "EPICONDUCTOR_COLLECTION_DATE=$EPICONDUCTOR_COLLECTION_DATE" >> /usr/local/lib/R/etc/Renviron.site
Expand Down
20 changes: 17 additions & 3 deletions Testing/Code/containerBuildTests.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 64,6 @@ if (is.na(grep(pattern, Sys.getenv("EPICONDUCTOR_CONTAINER_VERSION"), value = TR
}

# Stops build if the date is not in YYYY-MM-DD format
!is.na(as.Date(Sys.getenv("EPICONDUCTOR_COLLECTION_DATE"), format="%Y-%m-%d"))
if (is.na(as.Date(Sys.getenv("EPICONDUCTOR_COLLECTION_DATE"), format="%Y-%m-%d"))) {
stop(paste("Docker Container Environment Variable EPICONDUCTOR_COLLECTION_DATE: ", Sys.getenv("EPICONDUCTOR_COLLECTION_DATE")," not in format YYYY-MM-DD"), sep='')
}
Expand Down Expand Up @@ -106,7 105,7 @@ mismatchedCols(entailed_edges, "entailed_edges")
labels = c("Subject", "Object", "IRI", "DiseaseLocation", "Ontology", "Direct", "Inherited")
mismatchedCols(labels, "labels")

nhanes_variables_mappings = c("Variable", "TableName", "SourceTermID", "SourceTerm", "MappedTermLabel", "MappedTermCURIE", "MappedTermIRI", "MappingScore", "Tags", "Ontology")
nhanes_variables_mappings = c("Variable", "TableName", "SourceTermID", "SourceTerm", "MappedTermLabel", "MappedTermCURIE", "MappedTermIRI", "MappingScore", "Ontology")
mismatchedCols(nhanes_variables_mappings, "nhanes_variables_mappings")

##################################################################################################################
Expand Down Expand Up @@ -146,7 145,7 @@ if (nrow(DBI::dbGetQuery(cn, rawToQuestionnaire))>0) {

##################################################################################################################
# TEST: Raw and Translated tables have the same row counts
# RESULT:
# RESULT: if any tables have mismatched row counts, stop the build
##################################################################################################################

for (i in 1:nrow(allTableNames)) {
Expand Down Expand Up @@ -297,6 296,7 @@ logError <- function(tableName, variable){
)
)
}

for (i in 1:nrow(codebook)) {
variable = codebook[i, "Variable"]
tableName = codebook[i, "TableName"]
Expand Down Expand Up @@ -338,3 338,17 @@ insertStatement = paste(sep="",
)

SqlTools::dbSendUpdate(cn, insertStatement)

# shrink transaction log
SqlTools::dbSendUpdate(cn, "DBCC SHRINKFILE(NhanesLandingZone_log)")

# issue checkpoint
SqlTools::dbSendUpdate(cn, "CHECKPOINT")


#Check phonto and nhanesA installs
if (packageVersion("phonto")!="0.0.9"){stop(paste("Phonto installation failure or version 0.0.9 not matched."), sep='')}
if (packageVersion("nhanesA")!="1.0"){stop(paste("nhanesA installation failure or version 1.0 not matched."), sep='')}

# shutdown the database engine cleanly
SqlTools::dbSendUpdate(cn, "SHUTDOWN")

0 comments on commit e22a3e2

Please sign in to comment.