I wanted to automate the process from data modelling to testing it with sample data in actual database.

Table of Contents

library(dplyr, warn.conflicts = F)
data_model_dir = "/Users/mertnuhoglu/projects/itr/vrp_doc/data_model/"

Implementation of yumltordbschema::yuml_to_rdb() function

yuml_to_rdb() takes as input yuml formatted data model and converts it into normalized structural data in RDB style.

Example input:

[enum_value| enum_value_id PK; enum_var_id; enum_value_name TEXT; ]
[enum_var| enum_var_id PK; enum_var_name TEXT; ]
[enum_var] 1-n [enum_value]

Final output:

data_entity.tsv
| data_entity_id | entity_name |
| 7              | enum_value  |
| 8              | enum_var    |

data_field.tsv
| data_field_id | data_field_name | type   | pk_fk   | data_entity_id | fk_data_entity_id | enum_var_name |
| 26            | enum_value_id   | BIGINT | PK      | 7              |                   |               |
| 27            | enum_var_id     | BIGINT | FK      | 7              | 8                 |               |
| 28            | enum_value_name | TEXT   | NOT_KEY | 7              |                   |               |
| 29            | enum_var_id     | BIGINT | PK      | 8              |                   |               |
| 30            | enum_var_name   | TEXT   | NOT_KEY | 8              |                   |               |
yuml_to_rdb = function(data_model_dir) {
  build_datamodel_sdb.sh = system.file("bash/build_datamodel_sdb.sh", package = "yumltordbschema")
  system2(build_datamodel_sdb.sh, data_model_dir)

	yuml_lines = r_datamodel_sdb(data_model_dir) %>% 
    rutils::grepv("\\|")
	ydm = build_yuml_data_model(yuml_lines)
	den = update_new_entities(ydm)
  dfl = update_new_fields(ydm, den)

  write_yuml_data_model(ydm, data_model_dir = data_model_dir)
  write_data_entity(den, data_model_dir)
  write_data_field(dfl, data_model_dir)
  return(list(data_entity = den, data_field = dfl))
}

Calling yuml_to_rdb requires the parameter data_model_dir which is the path of the yuml files.

data_model_dir =  "/Users/mertnuhoglu/projects/itr/vrp_doc/data_model/"
rdt = yuml_to_rdb(data_model_dir)

build_datamodel_sdb.sh script: Concatenate yuml files and fix them

@tbd

build_yuml_data_model() function: Convert free format yuml content into structural data

Input: The unified and fixed yuml file: datamodel_sdb.yuml

Output: Structural data: yuml_data_model.csv

lines = c("[enum_value| enum_value_id INT PK; enum_var_id INT FK @NN; enum_value_name TEXT; ]",
  "[enum_var| enum_var_id INT PK; enum_var_name TEXT; ]")
lines
  ## [1] "[enum_value| enum_value_id INT PK; enum_var_id INT FK @NN; enum_value_name TEXT; ]"
  ## [2] "[enum_var| enum_var_id INT PK; enum_var_name TEXT; ]"
ydm_p1 = lines %>%
  stringr::str_replace_all( "^[ \\[]*", "" ) %>%
  stringr::str_replace_all( "[ \\]]*$", "" ) %>%
  stringr::str_replace_all( "\\|\\s*", "\\|" ) %>%
  stringr::str_replace_all( ";\\s*", ";" ) 
ydm_p1 
  ## [1] "enum_value|enum_value_id INT PK;enum_var_id INT FK @NN;enum_value_name TEXT;"
  ## [2] "enum_var|enum_var_id INT PK;enum_var_name TEXT;"
ydm_p2 = ydm_p1 %>%
  dplyr::data_frame( ln = . ) 
ydm_p2 
  ## # A tibble: 2 x 1
  ##                                                                             ln
  ##                                                                          <chr>
  ## 1 enum_value|enum_value_id INT PK;enum_var_id INT FK @NN;enum_value_name TEXT;
  ## 2                              enum_var|enum_var_id INT PK;enum_var_name TEXT;
ydm_p3 = ydm_p2 %>%
  tidyr::separate( ln, c("entity_name", "columns"), "\\|" ) %>%
  tibble::glimpse()
ydm_p4 = ydm_p3 %>%
  tidyr::unnest( columns = strsplit( columns, ";") ) 
ydm_p4 
  ## # A tibble: 5 x 2
  ##   entity_name                columns
  ##         <chr>                  <chr>
  ## 1  enum_value   enum_value_id INT PK
  ## 2  enum_value enum_var_id INT FK @NN
  ## 3  enum_value   enum_value_name TEXT
  ## 4    enum_var     enum_var_id INT PK
  ## 5    enum_var     enum_var_name TEXT
ydm_p5 = ydm_p4 %>%
  tidyr::separate( columns, c("data_field_name", "type", "pk_fk"), " " ) %>%
  dplyr::mutate( pk_fk = ifelse( is.na(pk_fk), "NON_KEY", pk_fk)) %>%
  dplyr::mutate( type = ifelse( is.na(type), "TEXT", type)) %>%
  dplyr::mutate( entity_name = tolower(entity_name) ) %>%
  dplyr::mutate( data_field_name = tolower(data_field_name) ) 
ydm_p5 
  ## # A tibble: 5 x 4
  ##   entity_name data_field_name  type   pk_fk
  ##         <chr>           <chr> <chr>   <chr>
  ## 1  enum_value   enum_value_id   INT      PK
  ## 2  enum_value     enum_var_id   INT      FK
  ## 3  enum_value enum_value_name  TEXT NON_KEY
  ## 4    enum_var     enum_var_id   INT      PK
  ## 5    enum_var   enum_var_name  TEXT NON_KEY

Handling new field attribute @NN in yuml

I noted that ydm_p4 manipulation gives warning: “Warning: Too many values at 2 locations: 2”

Checking why this warning is issued:

ydm_p4[2, ]
  ## # A tibble: 1 x 2
  ##   entity_name                columns
  ##         <chr>                  <chr>
  ## 1  enum_value enum_var_id INT FK @NN

These two fields have an additional attribute as @NN, but tidyr::separate( columns, c("data_field_name", "type", "pk_fk"), " " ) expects only 3 items.

This @NN attribute comes from original yuml file:

[enum_value| enum_value_id PK; enum_var_id @NN; enum_value_name TEXT; ]

The following tidyr::separate function call will parse additional @NN attribute:

ydm_p6 = ydm_p4 %>%
  tidyr::separate( columns, c("data_field_name", "type", "pk_fk", "not_null"), " " ) %>%
  dplyr::mutate( pk_fk = ifelse( is.na(pk_fk), "NON_KEY", pk_fk)) %>%
  dplyr::mutate( type = ifelse( is.na(type), "TEXT", type)) %>%
  dplyr::mutate( not_null = ifelse( is.na(not_null), FALSE, TRUE)) %>%
  dplyr::mutate( entity_name = tolower(entity_name) ) %>%
  dplyr::mutate( data_field_name = tolower(data_field_name) ) 
ydm_p6
  ## # A tibble: 5 x 5
  ##   entity_name data_field_name  type   pk_fk not_null
  ##         <chr>           <chr> <chr>   <chr>    <lgl>
  ## 1  enum_value   enum_value_id   INT      PK    FALSE
  ## 2  enum_value     enum_var_id   INT      FK     TRUE
  ## 3  enum_value enum_value_name  TEXT NON_KEY    FALSE
  ## 4    enum_var     enum_var_id   INT      PK    FALSE
  ## 5    enum_var   enum_var_name  TEXT NON_KEY    FALSE

No, need to make any changes in update_new_entities and update_new_fields

ydm = ydm_p6
den = yumltordbschema::update_new_entities(ydm)
den
  ## # A tibble: 2 x 2
  ##   data_entity_id entity_name
  ##            <int>       <chr>
  ## 1              1  enum_value
  ## 2              2    enum_var
dfl = yumltordbschema::update_new_fields(ydm, den)
dfl
  ## # A tibble: 5 x 8
  ##   data_field_id data_field_name  type   pk_fk not_null data_entity_id fk_data_entity_id enum_var_name
  ##           <int>           <chr> <chr>   <chr>    <lgl>          <int>             <int>         <chr>
  ## 1             1   enum_value_id   INT      PK    FALSE              1                NA          <NA>
  ## 2             2     enum_var_id   INT      FK     TRUE              1                 2          <NA>
  ## 3             3 enum_value_name  TEXT NON_KEY    FALSE              1                NA          <NA>
  ## 4             4     enum_var_id   INT      PK    FALSE              2                NA          <NA>
  ## 5             5   enum_var_name  TEXT NON_KEY    FALSE              2                NA          <NA>