Data Generation and Recommendation Engine Using Association Rules (Market Basket) in R

library(dplyr, warn.conflicts = F)
library(dbplyr, warn.conflicts = F)

df1 = readr::read_csv("brand.csv")
df2 = readr::read_csv("product_category.csv")
df3 = readr::read_csv("version.csv")

df = merge(df1, df2, all = T) %>%
  merge(df3, all = T) %>%
  mutate( product_title = sprintf("%s %s %s", brand, product_category, version) %>% tolower ) 
writeLines(df$product_title, "product_title.txt")

cat schema.sql
  ## DROP TABLE IF EXISTS product CASCADE;
  ## CREATE TABLE product (  
  ##   product_id SERIAL PRIMARY KEY
  ##   , title TEXT NOT NULL
  ##   -- df product_title: word=product_title.txt
  ##   -- df: text=product_title length=1
  ## );
  ## DROP TABLE IF EXISTS purchase_order CASCADE;
  ## CREATE TABLE purchase_order (  --df: mult=0.1
  ##   purchase_order_id SERIAL PRIMARY KEY
  ## );
  ## DROP TABLE IF EXISTS order_line CASCADE;
  ## CREATE TABLE order_line (  --df: mult=0.5
  ##   product_id INTEGER NOT NULL REFERENCES product (product_id)
  ##   , purchase_order_id INTEGER NOT NULL REFERENCES purchase_order (purchase_order_id)
  ##   , amount INTEGER NOT NULL --df: offset=1 size=5
  ##   , PRIMARY KEY (product_id, purchase_order_id)
  ## );

datafiller --seed 1 --size=300 schema.sql > data.sql

cat data.sql | sed -n "/COPY/,+2 p"
  ## COPY product (product_id,title) FROM STDIN (ENCODING 'utf-8');
  ## 1    kia jant fk 4
  ## 2    dacia kaporta 392
  ## COPY purchase_order (purchase_order_id) FROM STDIN (ENCODING 'utf-8');
  ## 1
  ## 2
  ## COPY order_line (product_id,purchase_order_id,amount) FROM STDIN (ENCODING 'utf-8');
  ## 183  6   5
  ## 186  14  2

Now, we are going to start postgresql database server and import the generated data.

docker start postgreststarterkit_db_1

psql -d app -h localhost -p 5432 -U superuser -f init.sql
  ## psql:init.sql:1: NOTICE:  drop cascades to 3 other objects
  ## DETAIL:  drop cascades to table recommendation_engine.product
  ## drop cascades to table recommendation_engine.purchase_order
  ## drop cascades to table recommendation_engine.order_line
  ## DROP SCHEMA
  ## CREATE SCHEMA
  ## SET
  ## psql:schema.sql:1: NOTICE:  table "product" does not exist, skipping
  ## DROP TABLE
  ## CREATE TABLE
  ## psql:schema.sql:8: NOTICE:  table "purchase_order" does not exist, skipping
  ## DROP TABLE
  ## CREATE TABLE
  ## psql:schema.sql:12: NOTICE:  table "order_line" does not exist, skipping
  ## DROP TABLE
  ## CREATE TABLE
  ## # filling table product (300)
  ## COPY 300
  ## # filling table purchase_order (30)
  ## COPY 30
  ## # filling table order_line (150)
  ## COPY 150
  ## ALTER SEQUENCE
  ## ALTER SEQUENCE
  ## ANALYZE
  ## ANALYZE
  ## ANALYZE

Read data from PostgreSQL into R dataframe:

con2 <- DBI::dbConnect(RPostgreSQL::PostgreSQL()
  , user = Sys.getenv("SUPER_USER")
  , password = Sys.getenv("SUPER_USER_PASSWORD")
  , dbname = "app"
  , host = "localhost"
  , port = "5432"
  , options = "-c search_path=recommendation_engine"
)
prd = dplyr::tbl(con2, "product")
por = dplyr::tbl(con2, "purchase_order")
orl = dplyr::tbl(con2, "order_line")

Join the tables:

orl2 = orl %>%
  dplyr::inner_join(prd, by = "product_id") %>%
  dplyr::select(purchase_order_id, title, amount) %>%
  dplyr::arrange(purchase_order_id, title)
orl2
  ## # Source:     lazy query [?? x 3]
  ## # Database:   postgres 10.0.0 [superuser@localhost:5432/app]
  ## # Ordered by: purchase_order_id, title
  ##    purchase_order_id                 title amount
  ##                <int>                 <chr>  <int>
  ##  1                 1       citroen egsoz a      2
  ##  2                 1        kia filitre 20      4
  ##  3                 1     opel aynalar fk 4      5
  ##  4                 1        opel egsoz 160      1
  ##  5                 2    bmw silecekler 120      3
  ##  6                 2     bmw silecekler 39      1
  ##  7                 2 citroen yağlama 1.9 j      4
  ##  8                 2    honda şanzıman 120      2
  ##  9                 2          opel far 160      5
  ## 10                 2   twingo yakıt 3.2 jk      5
  ## # ... with more rows

Target shape requires to concatenate all products in one purchase order transaction together:

orl3 = orl2 %>%
  dplyr::collect() %>%
  dplyr::group_by(purchase_order_id) %>%
  dplyr::mutate(key = sprintf("x%s", row_number())) %>%
  dplyr::ungroup() %>%
  dplyr::mutate( row = row_number() ) 
orl3
  ## # A tibble: 150 x 5
  ##    purchase_order_id                 title amount   key   row
  ##                <int>                 <chr>  <int> <chr> <int>
  ##  1                 1       citroen egsoz a      2    x1     1
  ##  2                 1        kia filitre 20      4    x2     2
  ##  3                 1     opel aynalar fk 4      5    x3     3
  ##  4                 1        opel egsoz 160      1    x4     4
  ##  5                 2    bmw silecekler 120      3    x1     5
  ##  6                 2     bmw silecekler 39      1    x2     6
  ##  7                 2 citroen yağlama 1.9 j      4    x3     7
  ##  8                 2    honda şanzıman 120      2    x4     8
  ##  9                 2          opel far 160      5    x5     9
  ## 10                 2   twingo yakıt 3.2 jk      5    x6    10
  ## # ... with 140 more rows

orl4 = orl3 %>%
  dplyr::select(purchase_order_id, key, title) %>%
  tidyr::spread(key = key, value = title) 
orl4 
  ## # A tibble: 30 x 11
  ##    purchase_order_id                      x1   x10                     x2
  ##  *             <int>                   <chr> <chr>                  <chr>
  ##  1                 1         citroen egsoz a  <NA>         kia filitre 20
  ##  2                 2      bmw silecekler 120  <NA>      bmw silecekler 39
  ##  3                 3 honda göstergeler 1.5 j  <NA> kia yönlendirme 3.2 jk
  ##  4                 4 citroen göstergeler 392  <NA>      ford elektrik cax
  ##  5                 5         honda fren fk 4  <NA>  mercedes balans 1.5 j
  ##  6                 6         citroen far 392  <NA>       dacia aynalar 12
  ##  7                 7         dacia rot 1.9 j  <NA>                   <NA>
  ##  8                 8         citroen diğer a  <NA>       dacia aynalar 12
  ##  9                 9      citroen mekanik 20  <NA>   ford yönlendirme 160
  ## 10                10    ford yönlendirme 160  <NA>        opel aynalar 39
  ## # ... with 20 more rows, and 7 more variables: x3 <chr>, x4 <chr>,
  ## #   x5 <chr>, x6 <chr>, x7 <chr>, x8 <chr>, x9 <chr>

.libPaths("/Users/mertnuhoglu/.exploratory/R/3.4")
library(exploratory)

# Data Analysis Steps
res = orl4 %>%
  dplyr::select(-purchase_order_id) %>%
  dplyr::mutate(transaction_id = row_number()) %>%
  tidyr::gather(key, product, -transaction_id, na.rm = TRUE, convert = TRUE) %>%
  dplyr::arrange(transaction_id) %>%
  exploratory::do_apriori(product, transaction_id, min_support = 0.0001) %>%
  dplyr::filter(support > 0.0004) %>%
  dplyr::group_by(rhs) %>%
  dplyr::top_n(3, lift)
res

  ## # A tibble: 9,966 x 5
  ## # Groups:   rhs [115]
  ##                      lhs                   rhs    support confidence  lift
  ##                    <chr>                 <chr>      <dbl>      <dbl> <dbl>
  ##  1       honda fren fk 4 mercedes balans 1.5 j 0.03333333        1.0    15
  ##  2 mercedes balans 1.5 j       honda fren fk 4 0.03333333        0.5    15
  ##  3     bmw mekanik 1.7 j      citroen fren 105 0.03333333        1.0    30
  ##  4      citroen fren 105     bmw mekanik 1.7 j 0.03333333        1.0    30
  ##  5     bmw mekanik 1.7 j       renault rot 1.3 0.03333333        1.0    30
  ##  6       renault rot 1.3     bmw mekanik 1.7 j 0.03333333        1.0    30
  ##  7          dacia aks 20        kia balans cax 0.03333333        1.0    30
  ##  8        kia balans cax          dacia aks 20 0.03333333        1.0    30
  ##  9          dacia aks 20         kia far 1.5 j 0.03333333        1.0    30
  ## 10         kia far 1.5 j          dacia aks 20 0.03333333        1.0    30
  ## # ... with 9,956 more rows

res = orl4 %>%
  dplyr::select(-purchase_order_id) %>%
  dplyr::mutate(transaction_id = row_number()) %>%
  tidyr::gather(key, product, -transaction_id, na.rm = TRUE, convert = TRUE) %>%
  dplyr::arrange(transaction_id) %>%
  exploratory::do_apriori(product, transaction_id, min_support = 0.0001) %>%
  dplyr::filter(support > 0.0004) %>%
  dplyr::group_by(rhs) %>%
  dplyr::top_n(3, lift)
res
  ## # A tibble: 9,966 x 5
  ## # Groups:   rhs [115]
  ##                      lhs                   rhs    support confidence  lift
  ##                    <chr>                 <chr>      <dbl>      <dbl> <dbl>
  ##  1       honda fren fk 4 mercedes balans 1.5 j 0.03333333        1.0    15
  ##  2 mercedes balans 1.5 j       honda fren fk 4 0.03333333        0.5    15
  ##  3     bmw mekanik 1.7 j      citroen fren 105 0.03333333        1.0    30
  ##  4      citroen fren 105     bmw mekanik 1.7 j 0.03333333        1.0    30
  ##  5     bmw mekanik 1.7 j       renault rot 1.3 0.03333333        1.0    30
  ##  6       renault rot 1.3     bmw mekanik 1.7 j 0.03333333        1.0    30
  ##  7          dacia aks 20        kia balans cax 0.03333333        1.0    30
  ##  8        kia balans cax          dacia aks 20 0.03333333        1.0    30
  ##  9          dacia aks 20         kia far 1.5 j 0.03333333        1.0    30
  ## 10         kia far 1.5 j          dacia aks 20 0.03333333        1.0    30
  ## # ... with 9,956 more rows

Data Generation and Recommendation Engine Using Association Rules (Market Basket) in R

Mert Nuhoglu