Generate Sample Data from DDL

1440 words · 7 min read

--- title: "Generate Sample Data from DDL" date: 2018-01-19T18:30:12+03:00 draft: false description: "" tags: categories: sql, datafiller type: post url: author: "Mert Nuhoglu" output: rmarkdown::html_document blog: mertnuhoglu.com resource_files: - data/ddl_to_data_schema_01.sql - data/ddl_to_data_data_01.sql - data/ddl_to_data_schema_02.sql - data/ddl_to_data_data_02.sql - data/ddl_to_data_schema_03.sql - data/ddl_to_data_data_03.sql - data/ddl_to_data_schema_04.sql - data/ddl_to_data_data_04.sql - data/ddl_to_data_schema_05.sql - data/ddl_to_data_data_05.sql - data/ddl_to_data_place_enum.txt - data/ddl_to_data_schema_06.sql - data/ddl_to_data_data_06.sql - data/ddl_to_data_schema_07.sql - data/ddl_to_data_data_07.sql - data/ddl_to_data_action_enum.txt - data/ddl_to_data_enum_value_id.txt - data/ddl_to_data_place_enum.txt path: ~/projects/study/problem/data_generation/ddl_to_data.Rmd --- --- title: "Generate Sample Data from DDL" date: 2018-01-19T18:30:12+03:00 draft: false description: "" tags: categories: sql, datafiller type: post url: author: "Mert Nuhoglu" output: rmarkdown::html_document blog: mertnuhoglu.com resource_files: - data/ddl_to_data_schema_01.sql - data/ddl_to_data_data_01.sql - data/ddl_to_data_schema_02.sql - data/ddl_to_data_data_02.sql - data/ddl_to_data_schema_03.sql - data/ddl_to_data_data_03.sql - data/ddl_to_data_schema_04.sql - data/ddl_to_data_data_04.sql - data/ddl_to_data_schema_05.sql - data/ddl_to_data_data_05.sql - data/ddl_to_data_place_enum.txt - data/ddl_to_data_schema_06.sql - data/ddl_to_data_data_06.sql - data/ddl_to_data_schema_07.sql - data/ddl_to_data_data_07.sql - data/ddl_to_data_action_enum.txt - data/ddl_to_data_enum_value_id.txt - data/ddl_to_data_place_enum.txt path: ~/projects/study/problem/data_generation/ddl_to_data.Rmd --- --- title: "Generate Sample Data from DDL" date: 2018-01-19T18:30:12+03:00 draft: false description: "" tags: categories: sql, datafiller type: post url: author: "Mert Nuhoglu" output: rmarkdown::html_document blog: mertnuhoglu.com resource_files: - data/ddl_to_data_schema_01.sql - data/ddl_to_data_data_01.sql - data/ddl_to_data_schema_02.sql - data/ddl_to_data_data_02.sql - data/ddl_to_data_schema_03.sql - data/ddl_to_data_data_03.sql - data/ddl_to_data_schema_04.sql - data/ddl_to_data_data_04.sql - data/ddl_to_data_schema_05.sql - data/ddl_to_data_data_05.sql - data/ddl_to_data_place_enum.txt - data/ddl_to_data_schema_06.sql - data/ddl_to_data_data_06.sql - data/ddl_to_data_schema_07.sql - data/ddl_to_data_data_07.sql - data/ddl_to_data_action_enum.txt - data/ddl_to_data_enum_value_id.txt - data/ddl_to_data_place_enum.txt path: ~/projects/study/problem/data_generation/ddl_to_data.Rmd --- --- title: "Generate Sample Data from DDL" date: 2018-01-19T18:30:12+03:00 draft: false description: "" tags: categories: sql, datafiller type: post url: author: "Mert Nuhoglu" output: rmarkdown::html_document blog: mertnuhoglu.com resource_files: - data/ddl_to_data_schema_01.sql - data/ddl_to_data_data_01.sql - data/ddl_to_data_schema_02.sql - data/ddl_to_data_data_02.sql - data/ddl_to_data_schema_03.sql - data/ddl_to_data_data_03.sql - data/ddl_to_data_schema_04.sql - data/ddl_to_data_data_04.sql - data/ddl_to_data_schema_05.sql - data/ddl_to_data_data_05.sql - data/ddl_to_data_place_enum.txt - data/ddl_to_data_schema_06.sql - data/ddl_to_data_data_06.sql - data/ddl_to_data_schema_07.sql - data/ddl_to_data_data_07.sql - data/ddl_to_data_action_enum.txt - data/ddl_to_data_enum_value_id.txt - data/ddl_to_data_place_enum.txt path: ~/projects/study/problem/data_generation/ddl_to_data.Rmd --- --- title: "Generate Sample Data from DDL" date: 2018-01-19T18:30:12+03:00 draft: false description: "" tags: categories: sql, datafiller type: post url: author: "Mert Nuhoglu" output: rmarkdown::html_document blog: mertnuhoglu.com resource_files: - data/ddl_to_data_schema_01.sql - data/ddl_to_data_data_01.sql - data/ddl_to_data_schema_02.sql - data/ddl_to_data_data_02.sql - data/ddl_to_data_schema_03.sql - data/ddl_to_data_data_03.sql - data/ddl_to_data_schema_04.sql - data/ddl_to_data_data_04.sql - data/ddl_to_data_schema_05.sql - data/ddl_to_data_data_05.sql - data/ddl_to_data_place_enum.txt - data/ddl_to_data_schema_06.sql - data/ddl_to_data_data_06.sql - data/ddl_to_data_schema_07.sql - data/ddl_to_data_data_07.sql - data/ddl_to_data_action_enum.txt - data/ddl_to_data_enum_value_id.txt - data/ddl_to_data_place_enum.txt path: ~/projects/study/problem/data_generation/ddl_to_data.Rmd --- --- title: "Generate Sample Data from DDL" date: 2018-01-19T18:30:12+03:00 draft: false description: "" tags: categories: sql, datafiller type: post url: author: "Mert Nuhoglu" output: rmarkdown::html_document blog: mertnuhoglu.com resource_files: - data/ddl_to_data_schema_01.sql - data/ddl_to_data_data_01.sql - data/ddl_to_data_schema_02.sql - data/ddl_to_data_data_02.sql - data/ddl_to_data_schema_03.sql - data/ddl_to_data_data_03.sql - data/ddl_to_data_schema_04.sql - data/ddl_to_data_data_04.sql - data/ddl_to_data_schema_05.sql - data/ddl_to_data_data_05.sql - data/ddl_to_data_place_enum.txt - data/ddl_to_data_schema_06.sql - data/ddl_to_data_data_06.sql - data/ddl_to_data_schema_07.sql - data/ddl_to_data_data_07.sql - data/ddl_to_data_action_enum.txt - data/ddl_to_data_enum_value_id.txt - data/ddl_to_data_place_enum.txt path: ~/projects/study/problem/data_generation/ddl_to_data.Rmd --- --- title: "Generate Sample Data from DDL" date: 2018-01-19T18:30:12+03:00 draft: false description: "" tags: categories: sql, datafiller type: post url: author: "Mert Nuhoglu" output: rmarkdown::html_document blog: mertnuhoglu.com resource_files: - data/ddl_to_data_schema_01.sql - data/ddl_to_data_data_01.sql - data/ddl_to_data_schema_02.sql - data/ddl_to_data_data_02.sql - data/ddl_to_data_schema_03.sql - data/ddl_to_data_data_03.sql - data/ddl_to_data_schema_04.sql - data/ddl_to_data_data_04.sql - data/ddl_to_data_schema_05.sql - data/ddl_to_data_data_05.sql - data/ddl_to_data_place_enum.txt - data/ddl_to_data_schema_06.sql - data/ddl_to_data_data_06.sql - data/ddl_to_data_schema_07.sql - data/ddl_to_data_data_07.sql - data/ddl_to_data_action_enum.txt - data/ddl_to_data_enum_value_id.txt - data/ddl_to_data_place_enum.txt path: ~/projects/study/problem/data_generation/ddl_to_data.Rmd --- --- title: "Generate Sample Data from DDL" date: 2018-01-19T18:30:12+03:00 draft: false description: "" tags: categories: sql, datafiller type: post url: author: "Mert Nuhoglu" output: rmarkdown::html_document blog: mertnuhoglu.com resource_files: - data/ddl_to_data_schema_01.sql - data/ddl_to_data_data_01.sql - data/ddl_to_data_schema_02.sql - data/ddl_to_data_data_02.sql - data/ddl_to_data_schema_03.sql - data/ddl_to_data_data_03.sql - data/ddl_to_data_schema_04.sql - data/ddl_to_data_data_04.sql - data/ddl_to_data_schema_05.sql - data/ddl_to_data_data_05.sql - data/ddl_to_data_place_enum.txt - data/ddl_to_data_schema_06.sql - data/ddl_to_data_data_06.sql - data/ddl_to_data_schema_07.sql - data/ddl_to_data_data_07.sql - data/ddl_to_data_action_enum.txt - data/ddl_to_data_enum_value_id.txt - data/ddl_to_data_place_enum.txt path: ~/projects/study/problem/data_generation/ddl_to_data.Rmd --- --- title: "Generate Sample Data from DDL" date: 2018-01-19T18:30:12+03:00 draft: false description: "" tags: categories: sql, datafiller type: post url: author: "Mert Nuhoglu" output: rmarkdown::html_document blog: mertnuhoglu.com resource_files: - data/ddl_to_data_schema_01.sql - data/ddl_to_data_data_01.sql - data/ddl_to_data_schema_02.sql - data/ddl_to_data_data_02.sql - data/ddl_to_data_schema_03.sql - data/ddl_to_data_data_03.sql - data/ddl_to_data_schema_04.sql - data/ddl_to_data_data_04.sql - data/ddl_to_data_schema_05.sql - data/ddl_to_data_data_05.sql - data/place_enum.txt path: ~/projects/study/problem/data_generation/ddl_to_data.Rmd ---

This document explains how to generate sample data for any database automatically from its database schema defined as DDL statements.

To do this, I use datafiller which is a command line tool that generates random data from database schemas described in DDL.

v01

Here is the first version of my database schema with embedded datafiller directives. In this first version, there is only one datafiller directive. It is in this line: CREATE TABLE Pln_Orl ( --df: mult=2.0

cat data/ddl_to_data_schema_01.sql
## CREATE TABLE Order_Line ( 
##   order_line_id INT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
##   purchase_order_id TEXT
##   );
## CREATE TABLE Plan (
##   plan_id TEXT,
##   title TEXT
##   );
## CREATE TABLE Pln_Orl (  --df: mult=2.0
##   plan_id TEXT,
##   order_line_id INT REFERENCES Order_Line (order_line_id) ON UPDATE CASCADE
##   );

df: mult=2.0 means to multiply number of samples to be generated for the corresponding entity with 2.0. When running datafiller we specify size of the default sample set. Here I specified it as --size=2, so datafiller will generate 4 rows of Pln_Orl.

datafiller --size=2 data/ddl_to_data_schema_01.sql > data/ddl_to_data_data_01.sql
cat data/ddl_to_data_data_01.sql | sed -n "/COPY/,+2 p"
## COPY order_line (order_line_id,purchase_order_id) FROM STDIN (ENCODING 'utf-8');
## 1    purchase
## 2    purchase
## COPY plan (plan_id,title) FROM STDIN (ENCODING 'utf-8');
## plan_id_1_   title_1_1_1_
## plan_id_1_   title_2_2_2_2_2_
## COPY pln_orl (plan_id,order_line_id) FROM STDIN (ENCODING 'utf-8');
## plan_id_4_   1
## plan_id_1_   1

v02

cat data/ddl_to_data_schema_02.sql
## CREATE TABLE order_line (
##   order_line_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY
##   ,  shipment_date TIMESTAMPTZ
##   ,  irsaliye_kg TEXT
##   ,  tesis TEXT
##   ,  yas_toz BOOLEAN
##   ) ;
## CREATE TABLE plan (
##   plan_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY
##   , title TEXT
##   ) ;
## CREATE TABLE pln_orl ( --df: mult=2.0
##   plan_id BIGINT REFERENCES plan (plan_id)
##   ,  order_line_id BIGINT REFERENCES order_line (order_line_id)
##   ) ;
datafiller --size=2 data/ddl_to_data_schema_02.sql > data/ddl_to_data_data_02.sql
cat data/ddl_to_data_data_02.sql | sed -n "/COPY/,+2 p"
## COPY order_line (shipment_date,irsaliye_kg,tesis,yas_toz) FROM STDIN (ENCODING 'utf-8');
## 2018-01-22 13:57:08  irsaliye_k  tesis_1_    FALSE
## 2018-01-22 13:57:08  irsaliye_k  tesis_2_2_2_    TRUE
## COPY plan (title) FROM STDIN (ENCODING 'utf-8');
## title_1_1_1_
## title_2_2_2_2_2_
## COPY pln_orl () FROM STDIN (ENCODING 'utf-8');

All BIGINT fields are left empty. It seems like datafiller does not support BIGINT properly.

v03

cat data/ddl_to_data_schema_03.sql
## CREATE TABLE order_line (
##   order_line_id INT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY
##   ,  shipment_date TIMESTAMPTZ
##   ,  irsaliye_kg TEXT
##   ,  tesis TEXT
##   ,  yas_toz BOOLEAN
##   ) ;
## CREATE TABLE plan (
##   plan_id INT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY
##   , title TEXT
##   ) ;
## CREATE TABLE pln_orl ( --df: mult=2.0
##   plan_id INT NOT NULL REFERENCES plan (plan_id)
##   ,  order_line_id INT NOT NULL REFERENCES order_line (order_line_id)
##   ) ;
datafiller --size=2 data/ddl_to_data_schema_03.sql > data/ddl_to_data_data_03.sql
cat data/ddl_to_data_data_03.sql | sed -n "/COPY/,+2 p"
## COPY order_line (order_line_id,shipment_date,irsaliye_kg,tesis,yas_toz) FROM STDIN (ENCODING 'utf-8');
## 1    2018-01-22 13:57:08 irsaliye_kg_2_  tesis_1_    TRUE
## 2    2018-01-22 13:57:08 irsaliye_k  tesis_2_2_2_    FALSE
## COPY plan (plan_id,title) FROM STDIN (ENCODING 'utf-8');
## 1    title_2_2_2_2_2_
## 2    title_2_2_2_2_2_
## COPY pln_orl (plan_id,order_line_id) FROM STDIN (ENCODING 'utf-8');
## 1    1
## 1    2

v04: Use Predefined Reference Ids as FK Data

Use the usual command word to reuse some predefined list of values:

-- df place_enum: word=./data/place_enum.txt
CREATE TABLE place (
  ...
  ,  place_enum INT
  -- df: text=place_enum length=1 
  ...

Note that, place_enum is not a FK according to the above definition.

Predefined reference ids are here:

cat data/place_enum.txt
## 1
## 2
## 3
## 4
## 5
cat data/ddl_to_data_schema_04.sql
## -- df place_enum: word=./data/place_enum.txt
## CREATE TABLE place (
##   place_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY
##   ,  place_enum INT
##   -- df: text=place_enum length=1 
##   ,  address TEXT 
##   ) ;
datafiller --size=5 data/ddl_to_data_schema_04.sql > data/ddl_to_data_data_04.sql
cat data/ddl_to_data_data_04.sql | sed -n "/COPY/,+2 p"
## COPY place (place_enum,address) FROM STDIN (ENCODING 'utf-8');
## 2    address_1_
## 4    address_3_3_3

v05: Use Predefined Reference Ids together with REFERENCES clause

What if the referring column is defined as FK

-- df place_enum: word=./data/place_enum.txt
CREATE TABLE enum_value (
  enum_value_id INT 
  -- df: text=place_enum length=1 
  ) ;
CREATE TABLE place (
  ...
  ,  place_enum INT REFERENCES enum_value (enum_value_id)
  -- df: text=place_enum length=1 
  ...
cat data/ddl_to_data_schema_05.sql
## -- df place_enum: word=./data/place_enum.txt
## CREATE TABLE enum_value (
##   enum_value_id INT PRIMARY KEY
##   -- df: text=place_enum length=1 
##   ) ;
## CREATE TABLE place (
##   place_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY
##   ,  place_enum INT  REFERENCES enum_value (enum_value_id)
##   -- df: text=place_enum length=1 
##   ,  address TEXT 
##   ) ;
datafiller --size=5 data/ddl_to_data_schema_05.sql > data/ddl_to_data_data_05.sql

This doesn’t work due to the following error:

## Traceback (most recent call last):
##   File "/usr/local/bin/datafiller", line 5067, in <module>
##     format(a.table.name, a.name, key.name)
## AssertionError: foreign key place.place_enum target enum_value_id must be unique

datafiller expects that enum_value$enum_value_id is unique when it is referenced by some FK.

cat data/ddl_to_data_data_05.sql | sed -n "/COPY/,+2 p"
## COPY enum_value (enum_value_id) FROM STDIN (ENCODING 'utf-8');
## 1
## 2
## COPY place (place_enum,address) FROM STDIN (ENCODING 'utf-8');
## 2    address_4_
## 4    address_5_5_5_5_

 Tech    19 Jan, 2018

Any work (images, writings, presentations, ideas or whatever) which I own is always provided under
Creative Commons License Creative Commons Attribution-Share Alike 3.0 License

Mert Nuhoglu is a Trabzon-born programmer and data scientist.

You may also like...