Convert yuml Formatted Data Model into RDB Style Structured Data

1378 words · 7 min read

options(width = 150)
options(max.print = 30)
library(dplyr, warn.conflicts = F)
data_model_dir = "/Users/mertnuhoglu/projects/itr/itr_documentation/data_model/"

Implementation of yumltordbschema::build_rdb_data() function

build_rdb_data() takes as input yuml formatted data model and converts it into normalized structural data in RDB style.

Example input:

[enum_value| enum_value_id PK; enum_var_id; enum_value_name TEXT; ]
[enum_var| enum_var_id PK; enum_var_name TEXT; ]
[enum_var] 1-n [enum_value]

Final output:

data_entity.tsv
| data_entity_id | entity_name |
| 7              | enum_value  |
| 8              | enum_var    |

data_field.tsv
| data_field_id | data_field_name | type   | pk_fk   | data_entity_id | fk_data_entity_id | enum_var_name |
| 26            | enum_value_id   | BIGINT | PK      | 7              |                   |               |
| 27            | enum_var_id     | BIGINT | FK      | 7              | 8                 |               |
| 28            | enum_value_name | TEXT   | NOT_KEY | 7              |                   |               |
| 29            | enum_var_id     | BIGINT | PK      | 8              |                   |               |
| 30            | enum_var_name   | TEXT   | NOT_KEY | 8              |                   |               |

build_datamodel_sdb.sh script: Concatenate yuml files and fix them

@tbd

build_yuml_data_model() function: Convert free format yuml content into structural data

Input: The unified and fixed yuml file: datamodel_sdb.yuml

Output: Structural data: yuml_data_model.csv

lines = c("[enum_value| enum_value_id INT PK; enum_var_id INT FK @NN; enum_value_name TEXT; ]",
  "[enum_var| enum_var_id INT PK; enum_var_name TEXT; ]")
lines
## [1] "[enum_value| enum_value_id INT PK; enum_var_id INT FK @NN; enum_value_name TEXT; ]"
## [2] "[enum_var| enum_var_id INT PK; enum_var_name TEXT; ]"
ydm_p1 = lines %>%
  stringr::str_replace_all( "^[ \\[]*", "" ) %>%
  stringr::str_replace_all( "[ \\]]*$", "" ) %>%
  stringr::str_replace_all( "\\|\\s*", "\\|" ) %>%
  stringr::str_replace_all( ";\\s*", ";" ) 
ydm_p1 
## [1] "enum_value|enum_value_id INT PK;enum_var_id INT FK @NN;enum_value_name TEXT;"
## [2] "enum_var|enum_var_id INT PK;enum_var_name TEXT;"
ydm_p2 = ydm_p1 %>%
  dplyr::data_frame( ln = . ) 
ydm_p2 
## # A tibble: 2 x 1
##                                                                             ln
##                                                                          <chr>
## 1 enum_value|enum_value_id INT PK;enum_var_id INT FK @NN;enum_value_name TEXT;
## 2                              enum_var|enum_var_id INT PK;enum_var_name TEXT;
ydm_p3 = ydm_p2 %>%
  tidyr::separate( ln, c("entity_name", "columns"), "\\|" ) %>%
  tibble::glimpse()
## Observations: 2
## Variables: 2
## $ entity_name <chr> "enum_value", "enum_var"
## $ columns     <chr> "enum_value_id INT PK;enum_var_id INT FK @NN;enum_value_name TEXT;", "enum_var_id INT PK;enum_var_name TEXT;"
ydm_p4 = ydm_p3 %>%
  tidyr::unnest( columns = strsplit( columns, ";") ) 
ydm_p4 
## # A tibble: 5 x 2
##   entity_name                columns
##         <chr>                  <chr>
## 1  enum_value   enum_value_id INT PK
## 2  enum_value enum_var_id INT FK @NN
## 3  enum_value   enum_value_name TEXT
## 4    enum_var     enum_var_id INT PK
## 5    enum_var     enum_var_name TEXT
ydm_p5 = ydm_p4 %>%
  tidyr::separate( columns, c("data_field_name", "type", "pk_fk"), " " ) %>%
  dplyr::mutate( pk_fk = ifelse( is.na(pk_fk), "NON_KEY", pk_fk)) %>%
  dplyr::mutate( type = ifelse( is.na(type), "TEXT", type)) %>%
  dplyr::mutate( entity_name = tolower(entity_name) ) %>%
  dplyr::mutate( data_field_name = tolower(data_field_name) ) 
## Warning: Too many values at 1 locations: 2
## Warning: Too few values at 2 locations: 3, 5
ydm_p5 
## # A tibble: 5 x 4
##   entity_name data_field_name  type   pk_fk
##         <chr>           <chr> <chr>   <chr>
## 1  enum_value   enum_value_id   INT      PK
## 2  enum_value     enum_var_id   INT      FK
## 3  enum_value enum_value_name  TEXT NON_KEY
## 4    enum_var     enum_var_id   INT      PK
## 5    enum_var   enum_var_name  TEXT NON_KEY

Handling new field attribute @NN in yuml

I noted that ydm_p4 manipulation gives warning: “Warning: Too many values at 2 locations: 2”

Checking why this warning is issued:

ydm_p4[2, ]
## # A tibble: 1 x 2
##   entity_name                columns
##         <chr>                  <chr>
## 1  enum_value enum_var_id INT FK @NN

These two fields have an additional attribute as @NN, but tidyr::separate( columns, c("data_field_name", "type", "pk_fk"), " " ) expects only 3 items.

This @NN attribute comes from original yuml file:

[enum_value| enum_value_id PK; enum_var_id @NN; enum_value_name TEXT; ]

The following tidyr::separate function call will parse additional @NN attribute:

ydm_p6 = ydm_p4 %>%
  tidyr::separate( columns, c("data_field_name", "type", "pk_fk", "not_null"), " " ) %>%
  dplyr::mutate( pk_fk = ifelse( is.na(pk_fk), "NON_KEY", pk_fk)) %>%
  dplyr::mutate( type = ifelse( is.na(type), "TEXT", type)) %>%
  dplyr::mutate( not_null = ifelse( is.na(not_null), FALSE, TRUE)) %>%
  dplyr::mutate( entity_name = tolower(entity_name) ) %>%
  dplyr::mutate( data_field_name = tolower(data_field_name) ) 
## Warning: Too few values at 4 locations: 1, 3, 4, 5
ydm_p6
## # A tibble: 5 x 5
##   entity_name data_field_name  type   pk_fk not_null
##         <chr>           <chr> <chr>   <chr>    <lgl>
## 1  enum_value   enum_value_id   INT      PK    FALSE
## 2  enum_value     enum_var_id   INT      FK     TRUE
## 3  enum_value enum_value_name  TEXT NON_KEY    FALSE
## 4    enum_var     enum_var_id   INT      PK    FALSE
## 5    enum_var   enum_var_name  TEXT NON_KEY    FALSE

No, need to make any changes in update_new_entities and update_new_fields

ydm = ydm_p6
den = yumltordbschema::update_new_entities(ydm)
den
## # A tibble: 2 x 2
##   data_entity_id entity_name
##            <int>       <chr>
## 1              1  enum_value
## 2              2    enum_var
dfl = yumltordbschema::update_new_fields(ydm, den)
dfl
## # A tibble: 5 x 8
##   data_field_id data_field_name  type   pk_fk not_null data_entity_id fk_data_entity_id enum_var_name
##           <int>           <chr> <chr>   <chr>    <lgl>          <int>             <int>         <chr>
## 1             1   enum_value_id   INT      PK    FALSE              1                NA          <NA>
## 2             2     enum_var_id   INT      FK     TRUE              1                 2          <NA>
## 3             3 enum_value_name  TEXT NON_KEY    FALSE              1                NA          <NA>
## 4             4     enum_var_id   INT      PK    FALSE              2                NA          <NA>
## 5             5   enum_var_name  TEXT NON_KEY    FALSE              2                NA          <NA>
build_ddl = function(data_entity, data_field) {
  get_den_pk = function(den, dfl) {
    dfl_pk = dfl %>%
      dplyr::filter(pk_fk == "PK") %>%
      dplyr::select(-pk_fk) %>%
      dplyr::select(pk_data_field_id = data_field_id, fk_data_entity_id = data_entity_id, pk_data_field_name = data_field_name)
    den %>%
      dplyr::select(data_entity_id, entity_name) %>%
      dplyr::left_join( dfl_pk, by = c("data_entity_id" = "fk_data_entity_id")) %>%
      dplyr::select(data_entity_id, fk_data_entity_name = entity_name, fk_data_field_name = pk_data_field_name) 
  }
  get_dfl_fk = function(den, dfl) {
    dfl_fk = dfl %>%
      dplyr::filter(pk_fk == "FK") %>%
      dplyr::select(-pk_fk) %>%
      dplyr::left_join(get_den_pk(den, dfl), by = c("fk_data_entity_id" = "data_entity_id"))
  }
  den = data_entity 
  dfl = data_field 
  den_pk = get_den_pk(den, dfl)
  sql = data.table::data.table(
                               data_entity_id = den$data_entity_id,
                               sql_create_table = ""
                               ) %>%
    data.table::setkey(data_entity_id)
  ids = dplyr::distinct(dfl, data_entity_id)$data_entity_id
  for (deid in ids) {
    fd = dplyr::filter(dfl, data_entity_id == deid) %>%
      dplyr::left_join(den, by = "data_entity_id")
    table_name = unique(fd$entity_name)
    non_keys = dplyr::filter(fd, pk_fk == "NON_KEY")
    pks = dplyr::filter(fd, pk_fk == "PK")
    fks = dplyr::filter(fd, pk_fk == "FK") %>%
      dplyr::left_join(den_pk, by = c( "fk_data_entity_id" = "data_entity_id" ))
    sql[deid]$sql_create_table = create_table_sql( table_name, non_keys, pks, fks )
  }
  return(sql)
}
create_table_sql = function(table_name, non_keys, pks, fks) {
  template_pk = "%s BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY"
  sql_pk = sprintf(template_pk, pks$data_field_name)
  template_fk = "%s BIGINT REFERENCES %s (%s)"
  sql_fk = sprintf(template_fk, fks$data_field_name, fks$fk_data_entity_name, fks$fk_data_field_name)
  template_nonkey = "%s %s"
  sql_nonkey = sprintf(template_nonkey, non_keys$data_field_name, non_keys$type)
  column_sql = c(sql_pk, sql_fk, sql_nonkey)
  column_names = column_sql %>% paste(collapse=", ")
    template = "CREATE TABLE %s (%s);"
    create_table = sprintf( template, table_name, column_names)
    return(create_table)
}
ddl = build_ddl( data_entity = den, data_field = dfl)
ddl$sql_create_table
## [1] "CREATE TABLE enum_value (enum_value_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, enum_var_id BIGINT REFERENCES enum_var (enum_var_id), enum_value_name TEXT);"
## [2] "CREATE TABLE enum_var (enum_var_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, enum_var_name TEXT);"

I want to add NOT NULL to this sql statement.

SQL statements are produced in create_table_sql function. Here, we have 3 different sql templates for 3 types of data fields, as follows:

create_table_sql = function(table_name, non_keys, pks, fks) {
  template_pk = "%s BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY"
  sql_pk = sprintf(template_pk, pks$data_field_name)
  template_fk = "%s BIGINT REFERENCES %s (%s)"
  sql_fk = sprintf(template_fk, fks$data_field_name, fks$fk_data_entity_name, fks$fk_data_field_name)
  template_nonkey = "%s %s"
  sql_nonkey = sprintf(template_nonkey, non_keys$data_field_name, non_keys$type)
  ...

NOT NULL idiom should be added to the sql templates of FK and non-key fields.

create_table_sql = function(table_name, non_keys, pks, fks) {
  template_pk = "%s BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY"
  sql_pk = sprintf(template_pk
                   , pks$data_field_name
                   )
  template_fk = "%s BIGINT %s REFERENCES %s (%s)"
  sql_fk = sprintf(template_fk
                   , fks$data_field_name
                   , ifelse(fks$not_null == TRUE, "NOT NULL", "")
                   , fks$fk_data_entity_name
                   , fks$fk_data_field_name
                   )
  template_nonkey = "%s %s %s"
  sql_nonkey = sprintf(template_nonkey
                       , non_keys$data_field_name
                       , non_keys$type
                       , ifelse(non_keys$not_null == TRUE, "NOT NULL", "")
                       )
  column_sql = c(sql_pk, sql_fk, sql_nonkey)
  column_names = column_sql %>% paste(collapse=", ")
    template = "CREATE TABLE %s (%s);"
    create_table = sprintf( template, table_name, column_names)
    return(create_table)
}

Now, lets test this. First prepare the data to pass to create_table_sql

ddl = build_ddl( data_entity = den, data_field = dfl)
ddl$sql_create_table
## [1] "CREATE TABLE enum_value (enum_value_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, enum_var_id BIGINT NOT NULL REFERENCES enum_var (enum_var_id), enum_value_name TEXT );"
## [2] "CREATE TABLE enum_var (enum_var_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY, enum_var_name TEXT );"

The resulting sql contains NOT NULL as expected:

enum_var_id BIGINT NOT NULL REFERENCES enum_var (enum_var_id),

 Tech    20 Jan, 2018

Any work (images, writings, presentations, ideas or whatever) which I own is always provided under
Creative Commons License Creative Commons Attribution-Share Alike 3.0 License

Mert Nuhoglu is a Trabzon-born programmer and data scientist.

You may also like...