Commit 1516e662 authored by hlgr's avatar hlgr
Browse files

added framework

parents
# cmake_minimum_required must be the FIRST command: it sets the policy
# defaults that project() and everything after it run under. The original
# called Project() first, which leaves policies in an unspecified state.
cmake_minimum_required(VERSION 3.7)
project(CO572CW1)

# Prefer the -pthread compile/link flag over plain -lpthread where supported.
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

if(MSVC)
add_compile_options(/W4 /WX)
else()
# Quote both STREQUAL operands so an empty or variable-like value of
# CMAKE_SYSTEM_PROCESSOR cannot be re-dereferenced or break the if().
if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "armv7l")
# Setting the architecture explicitly on the Pi because Raspbian doesn't
# actually report the ISA the hardware supports.
add_compile_options(-Wall -Wextra -pedantic -Werror -march=armv8-a+crc)
else()
add_compile_options(-Wall -Wextra -pedantic -Werror -march=native)
endif()
endif()
# --------------------------------------------------
# Third-party dependencies are built at configure-independent build time via
# ExternalProject and installed into <build>/deps; the targets below pick up
# headers and libraries from that prefix.
include(ExternalProject)
# Google Benchmark: linked by the micro- and macrobenchmark executables.
# GTest-based self-tests are disabled; CMAKE_BUILD_TYPE is forwarded so a
# Release build links an optimized benchmark library.
ExternalProject_Add(googlebenchmark
URL "https://github.com/google/benchmark/archive/v1.5.0.tar.gz"
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CO572CW1_BINARY_DIR}/deps -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
)
# Catch2 v2 is header-only at the point of use; the install step copies the
# headers into deps/include for the `tests` target.
ExternalProject_Add(catch2
URL "https://github.com/catchorg/Catch2/archive/v2.9.1.tar.gz"
CMAKE_ARGS -DCATCH_BUILD_TESTING=OFF -DCMAKE_INSTALL_PREFIX=${CO572CW1_BINARY_DIR}/deps -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
)
# --------------------------------------------------
add_executable(microbenchmarks microbenchmarks.cpp solution.c)
# ExternalProject targets are not linkable targets; this dependency only
# guarantees the benchmark library is built and installed before linking.
add_dependencies(microbenchmarks googlebenchmark)
set_property(TARGET microbenchmarks PROPERTY CXX_STANDARD 14)
# Always state link visibility explicitly: the keyword-less
# target_link_libraries signature has legacy semantics and cannot be mixed
# with the keyword form later.
target_link_libraries(microbenchmarks PRIVATE Threads::Threads)
# benchmark is installed as a STATIC archive, so use the static-library
# prefix and suffix. The original combined CMAKE_SHARED_LIBRARY_PREFIX with
# a hardcoded ".a", which names the wrong file on MSVC (benchmark.lib).
target_link_libraries(microbenchmarks PRIVATE ${CO572CW1_BINARY_DIR}/deps/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark${CMAKE_STATIC_LIBRARY_SUFFIX})
target_include_directories(microbenchmarks SYSTEM PUBLIC ${CO572CW1_BINARY_DIR}/deps/include)
# --------------------------------------------------
add_executable(macrobenchmark macrobenchmark.cpp solution.c)
# Build-order dependency only: ensures the benchmark library exists in deps/.
add_dependencies(macrobenchmark googlebenchmark)
set_property(TARGET macrobenchmark PROPERTY CXX_STANDARD 14)
# Explicit PRIVATE visibility (keyword-less target_link_libraries is legacy).
target_link_libraries(macrobenchmark PRIVATE Threads::Threads)
# Static archive: use the STATIC prefix/suffix instead of the shared-library
# prefix plus a hardcoded ".a" (wrong on MSVC, where it is benchmark.lib).
target_link_libraries(macrobenchmark PRIVATE ${CO572CW1_BINARY_DIR}/deps/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark${CMAKE_STATIC_LIBRARY_SUFFIX})
target_include_directories(macrobenchmark SYSTEM PUBLIC ${CO572CW1_BINARY_DIR}/deps/include)
# --------------------------------------------------
# Catch2-based unit tests. Catch2 v2 is used header-only here, so only an
# include path plus the build-ordering dependency is needed — nothing to link.
add_executable(tests tests.cpp solution.c)
add_dependencies(tests catch2)
set_property(TARGET tests PROPERTY CXX_STANDARD 14)
target_include_directories(tests SYSTEM PUBLIC ${CO572CW1_BINARY_DIR}/deps/include)
#+TITLE: Advanced Databases (CO572) -- Indexing and Querying Coursework
#+LaTeX_HEADER: \usepackage{fullpage}
* Introduction
The objective of this coursework is to practice the complex interplay
between data storage and processing. Depending on the available
indices, processing can be implemented using different algorithms with
various optimizations.
You will work with a database of the following schema:
#+begin_src sql :exports code
CREATE TABLE item (salesdate INT, employee INT, price INT);
CREATE TABLE order (
salesdate INT,
employee INT,
employeemanagerid INT,
discount INT
);
CREATE TABLE store (
managerid INT,
latitude INT,
longitude INT,
countryid INT
);
#+end_src
* Getting started
To get started log in to a lab machine and run the following sequence of commands:
#+BEGIN_SRC bash
git clone git@gitlab.doc.ic.ac.uk:ak10318/adb-coursework.git
cd adb-coursework
#+END_SRC
You may want to set up two separate build directories for the code,
one for debugging and one for benchmarking. Here is how you could do
that:
#+begin_src bash :exports code
mkdir Debug
cd Debug
cmake -DCMAKE_BUILD_TYPE=Debug ..
cd ..
mkdir Release
cd Release
cmake -DCMAKE_BUILD_TYPE=Release ..
cd ..
#+end_src
You can compile each by (respectively) typing:
#+begin_src bash :exports code
cmake --build Debug
#+end_src
or
#+begin_src bash :exports code
cmake --build Release
#+end_src
Note that the first time you build each of these will take a long time
since it also builds dependencies.
** Testing
To run the tests, simply run
#+begin_src bash :exports code
./Debug/tests
#+end_src
a successful run should produce output like this (pass -? for more options):
#+begin_src bash :exports code
===============================================================================
All tests passed (30 assertions in 3 test cases)
#+end_src
** Benchmarking
To run the benchmarks, simply run
#+begin_src bash :exports code
./Release/microbenchmarks
#+end_src
a semi-naive implementation (building no indices) would produce output like this
#+begin_src bash :exports code
Running ./Release/microbenchmarks
Run on (4 X 1200 MHz CPU s)
Load Average: 0.31, 0.81, 0.74
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
CreateIndicesBenchmark/1024 2.64 us 2.66 us 262929
CreateIndicesBenchmark/4096 2.65 us 2.66 us 262897
CreateIndicesBenchmark/32768 2.93 us 2.95 us 238952
CreateIndicesBenchmark/262144 2.93 us 2.94 us 237840
CreateIndicesBenchmark/1048576 2.93 us 2.94 us 237703
Query1Benchmark/1024 95.9 us 95.9 us 7192
Query1Benchmark/4096 1010 us 1010 us 704
Query1Benchmark/32768 46032 us 46030 us 15
Query1Benchmark/262144 2038912 us 2038335 us 1
Query1Benchmark/1048576 31529341 us 31528313 us 1
Query2Benchmark/1024 301 us 301 us 2328
Query2Benchmark/4096 5690 us 5690 us 117
Query2Benchmark/32768 120979 us 120959 us 6
Query2Benchmark/262144 5047080 us 5046606 us 1
Query2Benchmark/524288 10067662 us 10067067 us 1
Query3Benchmark/1024 1117 us 1117 us 629
Query3Benchmark/4096 11162 us 11159 us 67
Query3Benchmark/32768 565663 us 565543 us 1
Query3Benchmark/262144 38888371 us 38886797 us 1
#+end_src
a good solution (making use of appropriate indices) produces output
like this (same hardware):
#+begin_src bash :exports code
Running ./Release/microbenchmarks
Run on (4 X 1200 MHz CPU s)
Load Average: 0.24, 0.49, 0.63
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
CreateIndicesBenchmark/1024 1364 us 1364 us 515
CreateIndicesBenchmark/4096 5496 us 5497 us 127
CreateIndicesBenchmark/32768 52534 us 52535 us 13
CreateIndicesBenchmark/262144 504598 us 504595 us 1
CreateIndicesBenchmark/1048576 2237887 us 2237300 us 1
Query1Benchmark/1024 249 us 249 us 2808
Query1Benchmark/4096 1460 us 1460 us 480
Query1Benchmark/32768 11800 us 11796 us 60
Query1Benchmark/262144 99317 us 99310 us 7
Query1Benchmark/1048576 397303 us 397276 us 2
Query2Benchmark/1024 139 us 139 us 5034
Query2Benchmark/4096 2126 us 2126 us 329
Query2Benchmark/32768 17721 us 17720 us 37
Query2Benchmark/262144 22913 us 22913 us 28
Query2Benchmark/524288 28612 us 28611 us 18
Query3Benchmark/1024 643 us 643 us 1090
Query3Benchmark/4096 4174 us 4173 us 168
Query3Benchmark/32768 38987 us 38984 us 18
Query3Benchmark/262144 346434 us 346411 us 2
#+end_src
* Your task
Your task is to implement three queries using the techniques,
algorithms and data structures you have learned about in class. You
shall also implement some form of indexing to accelerate the
queries. You are free to implement an index structure of your
choosing but you need to justify your choice.
The file ~solution.c~ contains stubs for four functions: three of them
need to be filled with the implementation of the queries and one is a
preparation function you can use to build your index.
** Q1
#+begin_src sql :exports code
SELECT
COUNT(*)
FROM
Items,
Orders
WHERE
Items.price < 1000
AND Orders.employeeManagerID = 10
AND Items.salesDate = Orders.salesDate
AND Items.employee = Orders.employee
#+end_src
** Q2:
#+begin_src sql :exports code
SELECT
COUNT(*)
FROM
Items,
Orders
WHERE
Orders.discount = 10
AND Items.salesDate <= Orders.salesDate
AND Orders.salesDate <= Items.salesDate + 14
#+end_src
** Q3:
#+begin_src sql :exports code
SELECT
COUNT(*)
FROM
Items,
Orders,
Stores
WHERE
Stores.managerID = Orders.employeeManagerID
AND Items.salesDate = Orders.salesDate
AND Items.employee = Orders.employee
#+end_src
* Solution and Marking
Coursework shall be handed in in teams of two. Form teams and
designate one of you as lead. When handing in your solution, place a
file named ~partner.txt~ in the root of the repository. When marking
the solutions, we will read the first line of this file and attribute
the same marks to the login mentioned there.
The marks are distributed as follows:
- Correct implementation of the queries/passing tests: 40%
- Implementation of at least one indexing structure and use of the
index to accelerate query processing: 30%
- Justification of the decision to implement this index structure: 20%
- Something extra: 10%
The "extra" can be anything: a detailed performance analysis, hybrid
index structures, adaptive indexing, exploitation of hardware
features, etc. If you are unsure if your "extra" is enough to get full
marks, raise the matter after class.
* Submission
The coursework will be submitted using LabTS. Important are three
files that shall be in the root directory of the repository:
- ~solution.c~, containing *all* code pertaining to your solution. No
other files shall be modified!
- ~explanation.txt~, explaining the solution: what indices were
implemented and why? Also use this to justify your "extra". What is
cool about your solution and why do you deserve 10% extra marks
- ~partner.txt~, the file containing the name of your teammate
* Competition
In addition to, but completely unrelated to, the coursework, we are
having a competition. For that, we will run your queries in a
"macrobenchmark", i.e., in a sequence simulating the workload of a
real data management system. To make things interesting, we are
running your solution in a highly resource-constrained environment: a
raspberry pi (Model 3B). Every time you trigger a test on LabTS,
your solution is tested and the result uploaded to the leaderboard. The
leaderboard can be accessed at [[http://dbtitans.lsds.uk]].
#ifndef _DATA_GENERATOR_H_
#define _DATA_GENERATOR_H_
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <random>
#include "database.h"
namespace cppref {
using namespace std;
// implementation taken from cpp-ref (just to be portable across stdlibs,
// whose std::shuffle implementations draw differently and would change the
// generated data)
// NOTE(review): this is intentionally NOT a uniform Fisher-Yates shuffle —
// a uniform one would draw from g() % (i + 1), not g() % n. Do not "fix"
// it: the exact permutation is part of the deterministic data that the
// pinned expected results in tests.cpp depend on.
template <class RandomIt, class URBG> void shuffle(RandomIt first, RandomIt last, URBG&& g) {
auto n = last - first;
for(auto i = n - 1; i > 0; --i) {
std::swap(first[i], first[g() % n]); // ignoring modulo skew for the sake of portability
}
}
} // namespace cppref
// Populate `db` with deterministic synthetic data for the item, order and
// store tables. The LCG is seeded with a fixed constant so repeated runs —
// and the pinned expected values in tests.cpp — see identical data. All
// cardinalities derive from `itemsCardinality`; the caller owns the arrays
// and must release them with FreeDatabaseTables().
template <typename Generator = std::linear_congruential_engine<unsigned int, 16807, 0, 2147483647>>
void GenerateData(Database& db, size_t itemsCardinality = 16384) {
    Generator lcg(221); // fixed seed: data must be reproducible across runs
    auto averageNumberOfItemsPerOrder = 4ul;
    auto numberOfSalesPerDay = 32;
    auto numberOfStores = 256 * std::lround(std::log2(itemsCardinality));
    auto numberOfUniqueManagers = numberOfStores / 4; // four stores per manager
    auto numberOfUniqueCountries = 196;
    auto numberOfUniqueEmployees = numberOfUniqueManagers * 8;
    auto numberOfUniquePrices = itemsCardinality / 2;
    db.itemsCardinality = itemsCardinality;
    db.ordersCardinality = itemsCardinality / averageNumberOfItemsPerOrder;
    db.storesCardinality = numberOfStores;
    db.items = new ItemTuple[db.itemsCardinality];
    db.orders = new OrderTuple[db.ordersCardinality];
    db.stores = new StoreTuple[db.storesCardinality];
    // Stores: random manager/country ids; lat/long are raw generator output.
    for(auto i = 0ul; i < db.storesCardinality; i++) {
        db.stores[i].managerID = lcg() % numberOfUniqueManagers;
        db.stores[i].countryID = lcg() % numberOfUniqueCountries;
        db.stores[i].longitude = lcg();
        db.stores[i].latitude = lcg();
    }
    // Orders are generated in pairs sharing a sales date (offset by 0 or 1
    // day) and an employee; the manager comes from a randomly chosen store.
    for(auto i = 0ul; i < db.ordersCardinality; i+=2) {
        auto salesDate = (i / numberOfSalesPerDay) + lcg() % 2;
        for (size_t j = 0; j < 2; j++)
        {
            db.orders[i + j].salesDate = salesDate;
            db.orders[i + j].employee = i % numberOfUniqueEmployees;
            db.orders[i + j].employeeManagerID = db.stores[lcg() % db.storesCardinality].managerID;
            db.orders[i + j].discount = (lcg()) % 100;
        }
    }
    // NOTE(review): the last order is excluded from the shuffle range
    // (ordersCardinality - 1). Possibly unintended, but changing it would
    // alter the deterministic data the pinned test results depend on.
    cppref::shuffle(db.orders, db.orders + db.ordersCardinality - 1, lcg);
    // Each order spawns 3-5 items copying its date/employee, until the item
    // table is full.
    auto itemsCursor = 0ul;
    for(size_t i = 0ul; i < db.ordersCardinality; i++) {
        auto v = lcg();
        int numberOfItemsPerOrderVariation = ((v & 2)>>1) + (v & 1) - 1;
        size_t numberOfItemsPerOrder =
            averageNumberOfItemsPerOrder + numberOfItemsPerOrderVariation;
        for(size_t j = 0; (j < numberOfItemsPerOrder) && (itemsCursor < db.itemsCardinality); j++) {
            db.items[itemsCursor].salesDate = db.orders[i].salesDate;
            db.items[itemsCursor].employee = db.orders[i].employee;
            db.items[itemsCursor].price = lcg() % numberOfUniquePrices;
            itemsCursor++;
        }
    }
    // Shuffle items with a lognormal bias towards the front of the array.
    // Bug fix: the swap index must be clamped to the last valid element
    // (itemsCardinality - 1). The original clamped to itemsCardinality
    // itself, i.e. one past the end — a latent out-of-bounds write. For the
    // cardinalities used here the lognormal draws stay far below the bound,
    // so the generated data is unchanged.
    std::lognormal_distribution<double> distribution(.3, .8);
    for(size_t i = 0; i < db.itemsCardinality; i++)
        std::swap(
            db.items[i],
            db.items[std::min(db.itemsCardinality - 1,
                              static_cast<size_t>(floorl(distribution(lcg))))]);
}
// Release the three table arrays allocated by GenerateData.
// The pointers are reset to nullptr so a repeated call is a harmless no-op
// (delete[] nullptr is defined) instead of a double free, and any later
// accidental use fails loudly rather than touching freed memory.
void FreeDatabaseTables(struct Database& db) {
    delete[] db.items;
    db.items = nullptr;
    delete[] db.orders;
    db.orders = nullptr;
    delete[] db.stores;
    db.stores = nullptr;
}
#endif
#ifndef DATABASE_H_
#define DATABASE_H_
#include <stdlib.h>
/* One row of the item table. Per data_generator.h, salesDate/employee are
   copied from the generating order and price is drawn uniformly. */
struct ItemTuple
{
int salesDate;
int employee;
int price;
};
/* One row of the order table. employeeManagerID references a store's
   managerID (see data_generator.h); discount is in [0, 100). */
struct OrderTuple
{
int salesDate;
int employee;
int employeeManagerID;
int discount;
};
/* One row of the store table. latitude/longitude are raw generator output
   in the synthetic data, not real coordinates. */
struct StoreTuple
{
int managerID;
int latitude;
int longitude;
int countryID;
};
/* Handle to all three generated tables plus an opaque slot (`indices`) in
   which the student solution stores whatever index structures it builds in
   CreateIndices() and frees in DestroyIndices(). */
struct Database {
struct ItemTuple* items;
size_t itemsCardinality;  /* number of elements in items */
struct OrderTuple* orders;
size_t ordersCardinality; /* number of elements in orders */
struct StoreTuple* stores;
size_t storesCardinality; /* number of elements in stores */
void* indices;            /* owned by the solution; NULL when none built */
};
#endif
#include "data_generator.h"
#include "database.h"
#include "solution.h"
#include <benchmark/benchmark.h>
// Macrobenchmark: each timed iteration builds the indices, runs three rounds
// of all three queries with random parameters, then tears the indices down —
// simulating a mixed build/query/teardown workload on one large data set.
static void queryMix(benchmark::State& state) {
Database db{};
GenerateData(db, state.range(0));
std::default_random_engine generator;
// Parameter distributions mirror the value ranges GenerateData produces.
std::uniform_int_distribution<> managerIDs_distribution(0, (db.storesCardinality / 4));
std::uniform_int_distribution<> prices_distribution(0, (db.itemsCardinality / 2));
std::uniform_int_distribution<> dates_distribution(0, (db.ordersCardinality / 32));
std::uniform_int_distribution<> discounts_distribution(0, 100);
for(auto _ : state) {
CreateIndices(&db);
for(auto i = 0; i < 3; i++) {
// DoNotOptimize keeps the compiler from discarding the query results.
benchmark::DoNotOptimize(
Query1(&db, managerIDs_distribution(generator), prices_distribution(generator)));
benchmark::DoNotOptimize(
Query2(&db, discounts_distribution(generator), dates_distribution(generator)));
benchmark::DoNotOptimize(Query3(&db, countryIDs_distribution(generator)));
}
DestroyIndices(&db);
db.indices = nullptr;
}
FreeDatabaseTables(db);
}
// Single fixed size (2M items), reported in microseconds.
BENCHMARK(queryMix)->Arg(2 * 1024 * 1024)->Unit(benchmark::kMicrosecond);
BENCHMARK_MAIN();
#include <benchmark/benchmark.h>
#include "data_generator.h"
#include "database.h"
#include "solution.h"
// Times CreateIndices() in isolation: the matching DestroyIndices() runs
// between PauseTiming/ResumeTiming so only construction cost is measured.
static void CreateIndicesBenchmark(benchmark::State& state) {
Database db{};
GenerateData(db, state.range(0));
for(auto _ : state) {
CreateIndices(&db);
state.PauseTiming();
DestroyIndices(&db);
db.indices = nullptr;
state.ResumeTiming();
}
FreeDatabaseTables(db);
}
// Sweeps data sizes from 1K to 1M items.
BENCHMARK(CreateIndicesBenchmark)->Range(1024, 1024 * 1024)->Unit(benchmark::kMicrosecond);
// Benchmarks Query1 over pre-built indices; each timed iteration issues ten
// queries with random (managerID, price) parameters.
static void Query1Benchmark(benchmark::State& state) {
Database db{};
GenerateData(db, state.range(0));
CreateIndices(&db);
std::default_random_engine generator;
// Ranges match the values GenerateData produces for managers and prices.
std::uniform_int_distribution<> managerIDs_distribution(0, (db.storesCardinality / 4));
std::uniform_int_distribution<> prices_distribution(0, (db.itemsCardinality / 2));
for(auto _ : state)
for(auto i = 0; i < 10; i++)
// NOTE(review): the two distribution draws are function arguments, so
// their evaluation order is unspecified; results are deterministic only
// per compiler. Kept as-is to avoid perturbing measured behavior.
benchmark::DoNotOptimize(
Query1(&db, managerIDs_distribution(generator), prices_distribution(generator)));
DestroyIndices(&db);
db.indices = nullptr;
FreeDatabaseTables(db);
}
BENCHMARK(Query1Benchmark)->Range(1024, 1024 * 1024)->Unit(benchmark::kMicrosecond);
// Benchmarks Query2 over pre-built indices; each timed iteration issues ten
// queries with random (discount, date) parameters.
static void Query2Benchmark(benchmark::State& state) {
    Database db{};
    GenerateData(db, state.range(0));
    CreateIndices(&db);
    std::default_random_engine generator;
    std::uniform_int_distribution<> dates_distribution(0, (db.ordersCardinality / 32));
    std::uniform_int_distribution<> discounts_distribution(0, 100);
    for(auto _ : state)
        for(auto i = 0; i < 10; i++)
            // Bug fix: Query2's signature is (db, discount, date) — see
            // solution.h and the queryMix call in macrobenchmark.cpp. The
            // original passed the date draw as the discount and vice versa,
            // benchmarking the query with out-of-range parameters.
            benchmark::DoNotOptimize(
                Query2(&db, discounts_distribution(generator), dates_distribution(generator)));
    DestroyIndices(&db);
    db.indices = nullptr;
    FreeDatabaseTables(db);
}
BENCHMARK(Query2Benchmark)->Range(1024, 512 * 1024)->Unit(benchmark::kMicrosecond);
// Benchmarks Query3 over pre-built indices; each timed iteration issues ten
// queries with a random country id in [0, 196].
static void Query3Benchmark(benchmark::State& state) {
    Database db{};
    GenerateData(db, state.range(0));
    CreateIndices(&db);
    std::default_random_engine rng;
    std::uniform_int_distribution<> countryDist(0, 196);
    for(auto _ : state) {
        for(int repetition = 0; repetition < 10; ++repetition) {
            // Keep the result observable so the call cannot be elided.
            auto result = Query3(&db, countryDist(rng));
            benchmark::DoNotOptimize(result);
        }
    }
    DestroyIndices(&db);
    db.indices = nullptr;
    FreeDatabaseTables(db);
}
BENCHMARK(Query3Benchmark)->Range(1024, 256 * 1024)->Unit(benchmark::kMicrosecond);
BENCHMARK_MAIN();
#include "solution.h"
/* Q1 stub (see README): SELECT COUNT(*) FROM Items, Orders
     WHERE Items.price < :price AND Orders.employeeManagerID = :managerID
       AND Items.salesDate = Orders.salesDate AND Items.employee = Orders.employee
   The parameters presumably substitute for the literals in the coursework
   SQL — confirm against tests.cpp. Returns 0 until implemented. */
int Query1(struct Database* db, int managerID, int price) { return 0; }
/* Q2 stub (see README): SELECT COUNT(*) FROM Items, Orders
     WHERE Orders.discount = :discount
       AND Items.salesDate <= Orders.salesDate
       AND Orders.salesDate <= Items.salesDate + 14
   NOTE(review): the `date` parameter has no counterpart in the README's Q2
   SQL — confirm its intended role before implementing. Returns 0 for now. */
int Query2(struct Database* db, int discount, int date) { return 0; }
/* Q3 stub (see README): SELECT COUNT(*) FROM Items, Orders, Stores
     WHERE Stores.managerID = Orders.employeeManagerID
       AND Items.salesDate = Orders.salesDate AND Items.employee = Orders.employee
   NOTE(review): the README's Q3 SQL shows no countryID predicate, yet the
   function takes one — presumably Stores.countryID = :countryID; verify
   against tests.cpp. Returns 0 until implemented. */
int Query3(struct Database* db, int countryID) { return 0; }
/* Stub: build the solution's index structures and store them in db->indices.
   Nothing is built yet, so the parameter is only silenced. */
void CreateIndices(struct Database* db) { (void)db; }
/* Tear down whatever CreateIndices() built and clear the handle so the
   benchmarks' subsequent `db.indices = nullptr` assignments stay consistent.
   The original ended with a redundant `(void)db;` — that cast only suppresses
   an unused-parameter warning, and db IS used on the line above. Removed. */
void DestroyIndices(struct Database* db) {
    /* Free database indices here once CreateIndices allocates them. */
    db->indices = NULL;
}
#ifndef _SOLUTION_H_
#define _SOLUTION_H_
/* Public interface of the coursework solution (implemented in solution.c).
   Each query returns a COUNT(*)-style integer over the tables in `db`.
   NOTE(review): the guard name _SOLUTION_H_ (leading underscore followed by
   a capital) is reserved to the implementation; renaming it would require
   touching both guard lines together. */
#include "database.h"
/* C linkage so the C++ benchmark/test translation units can link solution.c. */
#ifdef __cplusplus
extern "C" {
#endif
/* Q1: items below `price` joined with orders of manager `managerID`. */
int Query1(struct Database* db, int managerID, int price);
/* Q2: orders with `discount` within 14 days of an item's sales date. */
int Query2(struct Database* db, int discount, int date);
/* Q3: item/order/store join; countryID role to be confirmed (see README). */
int Query3(struct Database* db, int countryID);
/* Build index structures into db->indices. */
void CreateIndices(struct Database* db);
/* Free index structures and reset db->indices. */
void DestroyIndices(struct Database* db);
#ifdef __cplusplus
}
#endif
#endif
#define CATCH_CONFIG_MAIN
#include "data_generator.h"
#include "database.h"
#include "solution.h"
#include <catch2/catch.hpp>
// Ten pinned parameter sets and their expected query results. The expected
// values depend on the exact deterministic data produced by GenerateData()
// with the default cardinality — never change the generator (or these
// arrays) independently of one another.
int managerIDs[10] = {7, 26, 103, 14, 77, 42, 54, 112, 5, 115};
int prices[10] = {200, 1205, 7221, 6590, 1800, 750, 968, 3500, 4550, 5225};
int discounts[10] = {10, 5, 80, 90, 70, 40, 55, 85, 7, 35};
int dates[10] = {2, 12, 7, 21, 18, 5, 13, 15, 8, 11};
int countryIDs[10] = {5, 105, 41, 52, 70, 38, 191, 11, 7, 39};
int query1_results[10] = {2, 9, 32, 14, 22, 5, 5, 8, 53, 21};
int query2_results[10] = {16687, 73210, 46038, 64247, 86670, 33833, 63174, 72352, 53691, 51191};
int query3_results[10] = {468, 1277, 801, 665, 950, 1118, 500, 571, 979, 709};
// Checks Query1 against the ten pinned parameter pairs. Catch2's GENERATE()
// re-enters the whole test case once per value of i, so the database and
// indices are rebuilt for every parameter set.
TEST_CASE("Queries 1 Solution works", "[queries]") {
Database db;
GenerateData(db);
CreateIndices(&db);
auto i = GENERATE(range(0, 10));
REQUIRE(Query1(&db, managerIDs[i], prices[i]) == query1_results[i]);
DestroyIndices(&db);
db.indices = nullptr;
FreeDatabaseTables(db);
}