Example: SQL Generation

2092 words · 10 min read

options(width = 250)
options(max.print = 30)
library(dplyr, warn.conflicts = F)

yumltordbschema::build_rdb_data()

build_rdb_data = function(data_model_dir) {
  build_datamodel_sdb.sh = system.file("bash/build_datamodel_sdb.sh", package = "yumltordbschema")
  system2(build_datamodel_sdb.sh, data_model_dir)

  build_yuml_data_model(data_model_dir)
  den = update_new_entities(data_model_dir)
  dfl = update_new_fields(den, data_model_dir)
  return(list(data_entity = den, data_field = dfl))
}

yumltordbschema::build_yuml_data_model()

lines = c("[enum_value| enum_value_id INT PK; enum_var_id INT FK; enum_value_name TEXT; ]",
  "[enum_var| enum_var_id INT PK; enum_var_name TEXT; ]",
  "[place| place_id INT PK; address TEXT; province TEXT; place_enum INT FK; ]")
lines
## [1] "[enum_value| enum_value_id INT PK; enum_var_id INT FK; enum_value_name TEXT; ]" "[enum_var| enum_var_id INT PK; enum_var_name TEXT; ]"                           "[place| place_id INT PK; address TEXT; province TEXT; place_enum INT FK; ]"
ydm = lines %>%
  stringr::str_replace_all( "^[ \\[]*", "" ) %>%
  stringr::str_replace_all( "[ \\]]*$", "" ) %>%
  stringr::str_replace_all( "\\|\\s*", "\\|" ) %>%
  stringr::str_replace_all( ";\\s*", ";" ) %>%
  dplyr::data_frame( ln = . ) %>%
  tidyr::separate( ln, c("entity_name", "columns"), "\\|" ) %>%
  tidyr::unnest( columns = strsplit( columns, ";") ) %>%
  tidyr::separate( columns, c("data_field_name", "type", "pk_fk"), " " ) %>%
  dplyr::mutate( pk_fk = ifelse( is.na(pk_fk), "NON_KEY", pk_fk)) %>%
  dplyr::mutate( type = ifelse( is.na(type), "TEXT", type)) %>%
  dplyr::mutate( entity_name = tolower(entity_name) ) %>%
  dplyr::mutate( data_field_name = tolower(data_field_name) ) 
## Warning: Too few values at 4 locations: 3, 5, 7, 8
ydm
## # A tibble: 9 x 4
##   entity_name data_field_name  type   pk_fk
##         <chr>           <chr> <chr>   <chr>
## 1  enum_value   enum_value_id   INT      PK
## 2  enum_value     enum_var_id   INT      FK
## 3  enum_value enum_value_name  TEXT NON_KEY
## 4    enum_var     enum_var_id   INT      PK
## 5    enum_var   enum_var_name  TEXT NON_KEY
## 6       place        place_id   INT      PK
## 7       place         address  TEXT NON_KEY
## 8       place        province  TEXT NON_KEY
## 9       place      place_enum   INT      FK

yumltordbschema::update_new_entities()

den = ydm %>%
  dplyr::distinct( entity_name, .keep_all = T) %>%
  dplyr::select( entity_name ) %>%
  dplyr::mutate( data_entity_id = row_number() ) %>%
  dplyr::select( data_entity_id, dplyr::everything() )
den
## # A tibble: 3 x 2
##   data_entity_id entity_name
##            <int>       <chr>
## 1              1  enum_value
## 2              2    enum_var
## 3              3       place

yumltordbschema::update_new_fields()

dfl_p1 = ydm %>%
  dplyr::inner_join( den, by = "entity_name" ) %>%
  dplyr::select(-entity_name) %>%
  dplyr::mutate( data_field_id = row_number() ) %>%
  dplyr::select( data_field_id, dplyr::everything() )
dfl_p1
## # A tibble: 9 x 5
##   data_field_id data_field_name  type   pk_fk data_entity_id
##           <int>           <chr> <chr>   <chr>          <int>
## 1             1   enum_value_id   INT      PK              1
## 2             2     enum_var_id   INT      FK              1
## 3             3 enum_value_name  TEXT NON_KEY              1
## 4             4     enum_var_id   INT      PK              2
## 5             5   enum_var_name  TEXT NON_KEY              2
## 6             6        place_id   INT      PK              3
## 7             7         address  TEXT NON_KEY              3
## 8             8        province  TEXT NON_KEY              3
## 9             9      place_enum   INT      FK              3
dfl_fk_id = dfl_p1 %>%
  dplyr::select( data_field_id, data_field_name, pk_fk ) %>%
  dplyr::filter( pk_fk == "FK" ) %>%
  dplyr::filter( rutils::greplm( data_field_name, "_id$") ) %>%
  dplyr::mutate( fk_data_entity_name = stringr::str_sub(data_field_name, end = -4)) %>%
  dplyr::select( data_field_id, fk_data_entity_name ) 
dfl_fk_id
## # A tibble: 1 x 2
##   data_field_id fk_data_entity_name
##           <int>               <chr>
## 1             2            enum_var
dfl_fk_enum = dfl_p1 %>%
  dplyr::select( data_field_id, data_field_name, pk_fk ) %>%
  dplyr::filter( pk_fk == "FK" ) %>%
  dplyr::mutate( enum_var_name = data_field_name) %>%
  dplyr::filter( rutils::greplm( data_field_name, "_enum$") ) %>%
  dplyr::mutate( fk_data_entity_name = "enum_value") %>%
  dplyr::select( data_field_id, fk_data_entity_name, enum_var_name ) 
dfl_fk_enum
## # A tibble: 1 x 3
##   data_field_id fk_data_entity_name enum_var_name
##           <int>               <chr>         <chr>
## 1             9          enum_value    place_enum
den_id = den %>%
  dplyr::select( data_entity_id, entity_name )
dfl_fk = dplyr::bind_rows(dfl_fk_id, dfl_fk_enum) %>%
  dplyr::left_join( den_id, by = c("fk_data_entity_name" = "entity_name") ) %>%
  dplyr::select(-fk_data_entity_name) %>%
  dplyr::select(data_field_id, fk_data_entity_id = data_entity_id, dplyr::everything()) 
dfl_fk
## # A tibble: 2 x 3
##   data_field_id fk_data_entity_id enum_var_name
##           <int>             <int>         <chr>
## 1             2                 2          <NA>
## 2             9                 1    place_enum
dfl = dfl_p1 %>%
  dplyr::left_join( dfl_fk, by = "data_field_id" )
dfl
## # A tibble: 9 x 7
##   data_field_id data_field_name  type   pk_fk data_entity_id fk_data_entity_id enum_var_name
##           <int>           <chr> <chr>   <chr>          <int>             <int>         <chr>
## 1             1   enum_value_id   INT      PK              1                NA          <NA>
## 2             2     enum_var_id   INT      FK              1                 2          <NA>
## 3             3 enum_value_name  TEXT NON_KEY              1                NA          <NA>
## 4             4     enum_var_id   INT      PK              2                NA          <NA>
## 5             5   enum_var_name  TEXT NON_KEY              2                NA          <NA>
## 6             6        place_id   INT      PK              3                NA          <NA>
## 7             7         address  TEXT NON_KEY              3                NA          <NA>
## 8             8        province  TEXT NON_KEY              3                NA          <NA>
## 9             9      place_enum   INT      FK              3                 1    place_enum

Check enum fields:

dfl_fk %>% dplyr::filter(!is.na(enum_var_name))
## # A tibble: 1 x 3
##   data_field_id fk_data_entity_id enum_var_name
##           <int>             <int>         <chr>
## 1             9                 1    place_enum
dfl %>% dplyr::filter(!is.na(enum_var_name))
## # A tibble: 1 x 7
##   data_field_id data_field_name  type pk_fk data_entity_id fk_data_entity_id enum_var_name
##           <int>           <chr> <chr> <chr>          <int>             <int>         <chr>
## 1             9      place_enum   INT    FK              3                 1    place_enum

yumltordbschema::build_ddl()

Initial input arguments:

data_entity = den
data_field = dfl
den = data_entity 
dfl = data_field 
den
## # A tibble: 3 x 2
##   data_entity_id entity_name
##            <int>       <chr>
## 1              1  enum_value
## 2              2    enum_var
## 3              3       place
dfl
## # A tibble: 9 x 7
##   data_field_id data_field_name  type   pk_fk data_entity_id fk_data_entity_id enum_var_name
##           <int>           <chr> <chr>   <chr>          <int>             <int>         <chr>
## 1             1   enum_value_id   INT      PK              1                NA          <NA>
## 2             2     enum_var_id   INT      FK              1                 2          <NA>
## 3             3 enum_value_name  TEXT NON_KEY              1                NA          <NA>
## 4             4     enum_var_id   INT      PK              2                NA          <NA>
## 5             5   enum_var_name  TEXT NON_KEY              2                NA          <NA>
## 6             6        place_id   INT      PK              3                NA          <NA>
## 7             7         address  TEXT NON_KEY              3                NA          <NA>
## 8             8        province  TEXT NON_KEY              3                NA          <NA>
## 9             9      place_enum   INT      FK              3                 1    place_enum
get_den_pk = function(den, dfl) {
  dfl_pk = dfl %>%
    dplyr::filter(pk_fk == "PK") %>%
    dplyr::select(-pk_fk) %>%
    dplyr::select(pk_data_field_id = data_field_id, fk_data_entity_id = data_entity_id, pk_data_field_name = data_field_name)
  den %>%
    dplyr::select(data_entity_id, entity_name) %>%
    dplyr::left_join( dfl_pk, by = c("data_entity_id" = "fk_data_entity_id")) %>%
    dplyr::select(data_entity_id, fk_data_entity_name = entity_name, fk_data_field_name = pk_data_field_name) 
}
den_pk = get_den_pk(den, dfl)
den_pk
## # A tibble: 3 x 3
##   data_entity_id fk_data_entity_name fk_data_field_name
##            <int>               <chr>              <chr>
## 1              1          enum_value      enum_value_id
## 2              2            enum_var        enum_var_id
## 3              3               place           place_id
get_dfl_fk = function(den, dfl) {
  dfl_fk = dfl %>%
    dplyr::filter(pk_fk == "FK") %>%
    dplyr::select(-pk_fk) %>%
    dplyr::left_join(get_den_pk(den, dfl), by = c("fk_data_entity_id" = "data_entity_id"))
}
dfl_fk = get_dfl_fk(den, dfl)
dfl_fk
## # A tibble: 2 x 8
##   data_field_id data_field_name  type data_entity_id fk_data_entity_id enum_var_name fk_data_entity_name fk_data_field_name
##           <int>           <chr> <chr>          <int>             <int>         <chr>               <chr>              <chr>
## 1             2     enum_var_id   INT              1                 2          <NA>            enum_var        enum_var_id
## 2             9      place_enum   INT              3                 1    place_enum          enum_value      enum_value_id

Check if all fk fields are set properly:

dfl_fk %>% dplyr::filter(is.na(fk_data_entity_id))
## # A tibble: 0 x 8
## # ... with 8 variables: data_field_id <int>, data_field_name <chr>, type <chr>, data_entity_id <int>, fk_data_entity_id <int>, enum_var_name <chr>, fk_data_entity_name <chr>, fk_data_field_name <chr>

Set up a data.table that will store CREATE TABLE sql statements:

sql = data.table::data.table(
                             data_entity_id = den$data_entity_id,
                             sql_create_table = ""
                             ) %>%
  data.table::setkey(data_entity_id)
ids = dplyr::distinct(dfl, data_entity_id)$data_entity_id

We will loop over each data_entity_id of all data fields dfl:

  for (deid in ids) {
    fd = dplyr::filter(dfl, data_entity_id == deid) %>%
      dplyr::left_join(den, by = "data_entity_id")
    table_name = unique(fd$entity_name)
    nonpks = dplyr::filter(fd, pk_fk != "PK")
    pks = dplyr::filter(fd, pk_fk == "PK")
    fks = dplyr::filter(fd, pk_fk == "FK") %>%
      dplyr::left_join(den_pk, by = "data_entity_id")
    sql[deid]$sql_create_table = create_table_sql( table_name, non_keys, pks, fks ) 
  }

Run the loop for deid = 1 as an example:

deid = 1
fd = dplyr::filter(dfl, data_entity_id == deid) %>%
  dplyr::left_join(den, by = "data_entity_id")
fd
## # A tibble: 3 x 8
##   data_field_id data_field_name  type   pk_fk data_entity_id fk_data_entity_id enum_var_name entity_name
##           <int>           <chr> <chr>   <chr>          <int>             <int>         <chr>       <chr>
## 1             1   enum_value_id   INT      PK              1                NA          <NA>  enum_value
## 2             2     enum_var_id   INT      FK              1                 2          <NA>  enum_value
## 3             3 enum_value_name  TEXT NON_KEY              1                NA          <NA>  enum_value

fd has the data fields of the current data entity among the ones we loop over:

table_name = unique(fd$entity_name)
non_keys = dplyr::filter(fd, pk_fk == "NON_KEY")
non_keys
## # A tibble: 1 x 8
##   data_field_id data_field_name  type   pk_fk data_entity_id fk_data_entity_id enum_var_name entity_name
##           <int>           <chr> <chr>   <chr>          <int>             <int>         <chr>       <chr>
## 1             3 enum_value_name  TEXT NON_KEY              1                NA          <NA>  enum_value
pks = dplyr::filter(fd, pk_fk == "PK")
pks
## # A tibble: 1 x 8
##   data_field_id data_field_name  type pk_fk data_entity_id fk_data_entity_id enum_var_name entity_name
##           <int>           <chr> <chr> <chr>          <int>             <int>         <chr>       <chr>
## 1             1   enum_value_id   INT    PK              1                NA          <NA>  enum_value
fks = dplyr::filter(fd, pk_fk == "FK") %>%
  dplyr::left_join(den_pk, by = c( "fk_data_entity_id" = "data_entity_id"))
fks
## # A tibble: 1 x 10
##   data_field_id data_field_name  type pk_fk data_entity_id fk_data_entity_id enum_var_name entity_name fk_data_entity_name fk_data_field_name
##           <int>           <chr> <chr> <chr>          <int>             <int>         <chr>       <chr>               <chr>              <chr>
## 1             2     enum_var_id   INT    FK              1                 2          <NA>  enum_value            enum_var        enum_var_id

Actual job of producing CREATE TABLE statements are done by create_table_sql() function:

Definition of create_table_sql() is here:

create_table_sql = function( table_name, non_keys, pks, fks ) {
  template_pk = "%s BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY"
  sql_pk = sprintf(template_pk, pks$data_field_name)
  template_fk = "%s int REFERENCES %s (%s)"
  sql_fk = sprintf(template_fk, fks$data_field_name, fks$fk_data_entity_name, fks$fk_data_field_name)
  template_nonkey = "%s %s"
  sql_nonkey = sprintf(template_nonkey, non_keys$data_field_name, non_keys$type)
  column_sql = c(sql_pk, sql_fk, sql_nonkey)
  column_names = column_sql %>% paste(collapse=", ")
  template = "CREATE TABLE %s (%s);"
  create_table = sprintf( template, table_name, column_names)
  return(create_table)
}
sql[deid]$sql_create_table = create_table_sql( table_name, non_keys, pks, fks )

yumltordbschema::create_table_sql()

We have different templates for different types of data fields: PK fields, FK fields, non-key fields. These data fields are given as input to the function: non_keys, pks, fks

template_pk = "%s BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY"
sql_pk = sprintf(template_pk, pks$data_field_name)
sql_pk
## [1] "enum_value_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY"
template_fk = "%s int REFERENCES %s (%s)"
sql_fk = sprintf(template_fk, fks$data_field_name, fks$fk_data_entity_name, fks$fk_data_field_name)
sql_fk
## [1] "enum_var_id int REFERENCES enum_var (enum_var_id)"
template_nonkey = "%s %s"
sql_nonkey = sprintf(template_nonkey, non_keys$data_field_name, non_keys$type)
sql_nonkey
## [1] "enum_value_name TEXT"

Now, we can concatenate all field definitions:

column_sql = c(sql_pk, sql_fk, sql_nonkey)
column_names = column_sql %>% paste(collapse=", ")
template = "CREATE TABLE %s (%s);"
create_table = sprintf( template, table_name, column_names)
create_table
## [1] "CREATE TABLE enum_value (enum_value_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, enum_var_id int REFERENCES enum_var (enum_var_id), enum_value_name TEXT);"

Run the function at once:

create_table_sql( table_name, non_keys, pks, fks )
## [1] "CREATE TABLE enum_value (enum_value_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, enum_var_id int REFERENCES enum_var (enum_var_id), enum_value_name TEXT);"

Why Can’t We Use apply instead of for loop?

In order to use apply(), we first need to wrap the body of for loop inside a function:

create_sql = function(fd) {
  table_name = unique(fd$entity_name)
  nonpks = dplyr::filter(fd, pk_fk != "PK")
  pks = dplyr::filter(fd, pk_fk == "PK")
  yumltordbschema::create_table_sql( 
    table_name, nonpks$data_field_name, nonpks$type, pks$data_field_name )
}

Now, we could have called the function like that:

apply(dfl, MARGIN = 1, create_sql)

But this will not work as expected, because in the above for loop we loop per each distinct data_entity_aid:

ids = dplyr::distinct(dfl, data_entity_aid)$data_entity_aid
for (deid in ids) {
  fd = dplyr::filter(dfl, data_entity_aid == deid)
  ...
}

In order to use apply() function, we need to ensure that each row of dfl contains a distinct data_entity_aid.

Can I replace for loop with lapply?

Instead of for loop, I can use lapply here:

for (deid in ids) {
  fd = dplyr::filter(dfl, data_entity_id == deid) %>%
    dplyr::left_join(den, by = "data_entity_id")
  table_name = unique(fd$entity_name)
  nonpks = dplyr::filter(fd, pk_fk != "PK")
  pks = dplyr::filter(fd, pk_fk == "PK")
  fks = dplyr::filter(fd, pk_fk == "FK") %>%
    dplyr::left_join(den_pk, by = "data_entity_id")
  sql[deid]$sql_create_table = create_table_sql( 
    table_name, nonpks$data_field_name, nonpks$type, pks$data_field_name, fks )
}

It will be something like:

loop_sql_create_table = function(deid, dfl, den) {
  fd = dplyr::filter(dfl, data_entity_id == deid) %>%
    dplyr::left_join(den, by = "data_entity_id")
  table_name = unique(fd$entity_name)
  nonpks = dplyr::filter(fd, pk_fk != "PK")
  pks = dplyr::filter(fd, pk_fk == "PK")
  fks = dplyr::filter(fd, pk_fk == "FK") %>%
  dplyr::left_join(get_den_pk(den, dfl), by = "data_entity_id")
  return(tibble::tibble(
                        data_entity_id = deid
                        , sql_create_table = create_table_sql( table_name, non_keys, pks, fks )
                        )
         )
}
sql = lapply(ids, loop_sql_create_table, dfl, den) %>%
  dplyr::bind_rows()
sql
## # A tibble: 3 x 2
##   data_entity_id                                                                                                                                                          sql_create_table
##            <int>                                                                                                                                                                     <chr>
## 1              1 CREATE TABLE enum_value (enum_value_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, enum_var_id int REFERENCES enum_value (enum_value_id), enum_value_name TEXT);
## 2              2                                                            CREATE TABLE enum_var (enum_var_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, enum_value_name TEXT);
## 3              3                      CREATE TABLE place (place_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, place_enum int REFERENCES place (place_id), enum_value_name TEXT);

Which one is better?

I think for loop is easier to read and understand. But even so, I choose lapply over for loop because of composability. Having a side-effect-free function allows us to grow the program by composing building blocks.

 Tech    20 Jan, 2018

Any work (images, writings, presentations, ideas or whatever) which I own is always provided under
Creative Commons License Creative Commons Attribution-Share Alike 3.0 License

Mert Nuhoglu is a Trabzon-born programmer and data scientist.

You may also like...