Commit 28e188e0 authored by hlgr's avatar hlgr
Browse files

create the project

parents
cmake_minimum_required(VERSION 3.12)
add_executable(Joins Joins.cpp Volcano.cpp Storage.cpp Coursework1_2018.cpp )
set_property(TARGET Joins PROPERTY CXX_STANDARD 17)
#include <iostream>
#include "Volcano.hpp"
#include "Joins.hpp"
#include <unordered_set>
#include <set>
#include <algorithm>
#include <cxxabi.h>
#include <random>
#include <iomanip>
using namespace std;
static Table customerReferenceData{
{1L, "Holger", "180 Queens Gate"}, {2L, "Sam", "32 Vassar Street"},
{4L, "Daniel", "180 Queens Gate"}, {5L, "Robert", "32 Vassar Street"},
{7L, "Thomas", "180 Queens Gate"}, {8L, "Mike", "32 Vassar Street"},
{3L, "Peter", "180 Queens Gate"}, {6L, "Peter", "180 Queens Gate"}};
static Table ordersReferenceData{{1L, 2L}, {2L, 3L}, {3L, 1L}, {4L, 2L},
{7L, 2L}, {5L, 3L}, {9L, 1L}, {8L, 2L}};
namespace utility {
bool compareResults(unique_ptr<Operator> plan, unique_ptr<Operator> reference, string caseName) {
multiset<Tuple> referenceResult;
multiset<Tuple> achievedResult, delta;
reference->open();
for(auto t = reference->next(); t; t = reference->next())
referenceResult.insert(t);
plan->open();
for(auto t = plan->next(); t; t = plan->next())
achievedResult.insert(t);
set_symmetric_difference(referenceResult.begin(), referenceResult.end(), achievedResult.begin(),
achievedResult.end(), inserter(delta, end(delta)));
cout << caseName << " correct: " << boolalpha << (delta.size() == 0) << endl;
if(delta.size() > 20)
cout << "way to many differences to display" << endl;
else {
for_each(begin(delta), end(delta), [&referenceResult](auto const& t) {
cout << "(" << t << "): " << (referenceResult.count(t) ? "missing" : "superfluous")
<< " in result" << endl;
});
}
return delta.size() == 0;
}
}
namespace reference {
auto joinUsingCrossProduct(Table& customer, Table& orders, bool tablesSwapped = false) {
return make_unique<Select>(
make_unique<Cross>(make_unique<Scan>(customer), make_unique<Scan>(orders)),
[tablesSwapped](auto t) { return t[0] == t[tablesSwapped ? 2 : 3]; });
}
unique_ptr<Operator> inequalityJoin(Table& customer, Table& orders) {
return make_unique<Select>(
make_unique<Cross>(make_unique<Scan>(customer), make_unique<Scan>(orders)),
[](auto t) { return t[3] < long(t[0]) + 1 && t[3] > long(t[0]) - 1; });
}
}
namespace queries {
template <typename JoinImplementation>
unique_ptr<Operator> equalityJoin(Table& customer, Table& orders) {
return make_unique<JoinImplementation>(make_unique<Scan>(customer), 0, make_unique<Scan>(orders),
0, orders.size());
}
template <typename JoinImplementation>
unique_ptr<Operator> inequalityJoin(Table& customer, Table& orders) {
return make_unique<JoinImplementation>(make_unique<Scan>(customer), 0, pair{-1, 1},
make_unique<Scan>(orders), 0, orders.size());
}
}
namespace run {
template <typename T> void equalityQuery(bool needsSorting = false) {
auto customer = customerReferenceData;
auto orders = ordersReferenceData;
if(needsSorting) {
sort(begin(customer), end(customer),
[](auto const& t1, auto const& t2) { return t1[0] < t2[0]; });
sort(begin(orders), end(orders), [](auto const& t1, auto const& t2) { return t1[0] < t2[0]; });
}
cout << "Testing Equality Join: " << abi::__cxa_demangle(typeid(T).name(), 0, 0, NULL) << "..."
<< endl;
utility::compareResults(queries::equalityJoin<T>(customer, orders),
reference::joinUsingCrossProduct(customer, orders), "Customer x Orders");
utility::compareResults(queries::equalityJoin<T>(orders, customer),
reference::joinUsingCrossProduct(orders, customer, true),
"Orders x Customer");
cout << "--------------------" << endl;
}
template <typename T> void inequalityQuery() {
auto customer = customerReferenceData;
auto orders = ordersReferenceData;
cout << "Testing Inequality Join: " << abi::__cxa_demangle(typeid(T).name(), 0, 0, NULL) << "..."
<< endl;
utility::compareResults(queries::inequalityJoin<T>(customer, orders),
reference::inequalityJoin(customer, orders), "Customer <> Orders");
cout << "--------------------" << endl;
}
}
namespace runAtScale {
pair<Table, Table> generateData(size_t scale, bool needsSorting) {
Table customer(scale);
for(size_t i = 0; i < customer.size(); i++)
customer[i] = Tuple({long(i * 3), long((i * 312839) % 372189), long(17 * i)});
Table orders(scale * 3);
for(size_t i = 0, j = 0; i < orders.size(); i++)
orders[i] = Tuple({long((j += 1 + (rand() % 3))), long(17 * i)});
if(!needsSorting) {
std::random_device rd;
std::mt19937 g(rd());
shuffle(begin(customer), end(customer), g);
shuffle(begin(orders), end(orders), g);
}
return {customer, orders};
}
template <typename T>
void equalityQuery(size_t scale = 1000, bool needsSorting = false, bool checkCorrectness = true) {
auto generatedData = generateData(scale, needsSorting);
auto customer = generatedData.first;
auto orders = generatedData.second;
cout << "Testing Equality Join At Scale " << setw(9) << scale << " : "
<< abi::__cxa_demangle(typeid(T).name(), 0, 0, NULL) << "...";
auto case1Start = chrono::high_resolution_clock::now();
auto case1 = queries::equalityJoin<T>(customer, orders);
auto between = chrono::high_resolution_clock::now();
auto case2 = queries::equalityJoin<T>(orders, customer);
auto case2Finish = chrono::high_resolution_clock::now();
if(checkCorrectness) {
cout << endl;
utility::compareResults(move(case1), reference::joinUsingCrossProduct(customer, orders),
"Customer x Orders");
utility::compareResults(move(case2), reference::joinUsingCrossProduct(orders, customer, true),
"Orders x Customer");
cout << "--------------------" << endl;
} else {
std::cout << "Customer Left: " << setw(9)
<< (chrono::duration_cast<chrono::microseconds>(between - case1Start).count())
<< "us";
std::cout << ", Orders Left: " << setw(9)
<< (chrono::duration_cast<chrono::microseconds>(case2Finish - between).count())
<< "us" << endl;
}
}
}
int main() {
bool enableBonus = false;
cout << endl;
cout << endl << "testing correctness";
cout << endl << "===================" << endl;
run::equalityQuery<NestedLoopsJoin>();
run::equalityQuery<HashJoin>();
run::equalityQuery<SortMergeJoin>(true);
cout << endl;
cout << endl << "scaling experiments ";
cout << endl << "===================" << endl;
for(size_t i = 10; i < 1001; i *= 10) {
runAtScale::equalityQuery<NestedLoopsJoin>(i);
runAtScale::equalityQuery<HashJoin>(i);
runAtScale::equalityQuery<SortMergeJoin>(i, true);
}
cout << endl;
cout << endl << "performance testing";
cout << endl << "===================" << endl;
for(size_t i = 10; i < 10000000; i *= 10)
runAtScale::equalityQuery<NestedLoopsJoin>(i, false, false);
for(size_t i = 10; i < 10000000; i *= 10)
runAtScale::equalityQuery<HashJoin>(i, false, false);
for(size_t i = 10; i < 10000000; i *= 10)
runAtScale::equalityQuery<SortMergeJoin>(i, true, false);
if(enableBonus)
run::inequalityQuery<SortMergeJoin>();
return 0;
}
#include "Joins.hpp"
void NestedLoopsJoin::open(){};
Tuple NestedLoopsJoin::next() { return {}; };
void NestedLoopsJoin::close(){};
void HashJoin::open(){};
Tuple HashJoin::next() { return {}; };
void HashJoin::close(){};
void SortMergeJoin::open(){};
Tuple SortMergeJoin::next() { return {}; };
void SortMergeJoin::close(){};
#include "Volcano.hpp"
class Join : public Operator {
protected:
unique_ptr<Operator> left;
size_t leftAttributeID;
unique_ptr<Operator> right;
size_t rightAttributeID;
Join(unique_ptr<Operator> left, size_t leftAttributeID, unique_ptr<Operator> right,
size_t rightAttributeID)
: left(move(left)), leftAttributeID(leftAttributeID), right(move(right)),
rightAttributeID(rightAttributeID) {}
};
class NestedLoopsJoin : public Join {
public:
NestedLoopsJoin(unique_ptr<Operator> left, size_t leftAttributeID, unique_ptr<Operator> right,
size_t rightAttributeID, size_t rightSideCardinalityEstimate)
: Join(move(left), leftAttributeID, move(right), rightAttributeID) {}
void open();
Tuple next();
void close();
};
class SortMergeJoin : public Join {
public:
SortMergeJoin(unique_ptr<Operator> left, size_t leftAttributeID, pair<long, long> windowOnTheLeft,
unique_ptr<Operator> right, size_t rightAttributeID,
size_t rightSideCardinalityEstimate)
: Join(move(left), leftAttributeID, move(right), rightAttributeID) {}
SortMergeJoin(unique_ptr<Operator> left, size_t leftAttributeID, unique_ptr<Operator> right,
size_t rightAttributeID, size_t rightSideCardinalityEstimate)
: SortMergeJoin(move(left), leftAttributeID, {0, 0}, move(right), rightAttributeID,
rightSideCardinalityEstimate) {}
void open();
Tuple next();
void close();
};
class HashJoin : public Join {
public:
HashJoin(unique_ptr<Operator> left, size_t leftAttributeID, unique_ptr<Operator> right,
size_t rightAttributeID, size_t rightSideCardinalityEstimate)
: Join(move(left), leftAttributeID, move(right), rightAttributeID) {}
void open();
Tuple next();
void close();
};
#include "Storage.hpp"
class Tuple;
template<>
size_t BufferManager<Tuple>::numberOfTuplesPerPage(unsigned long tupleSize) { //
return 4096 / tupleSize;
}
#include<map>
using namespace std;
template <typename Tuple> class BufferManager {
protected:
map<string, vector<Tuple>> openPages;
map<string, vector<string>> pagesOnDisk;
size_t numberOfTuplesPerPage(size_t tupleSize);
public:
vector<Tuple>& getOpenPageForRelation(string relationName);
void commitOpenPageForRelation(string relationName);
vector<vector<Tuple>> getPagesForRelation(string relationName);
};
#include "Volcano.hpp"
Tuple Cross::next() {
if(currentBufferedRightOffset == bufferedRightTuples.size()) {
currentBufferedRightOffset = 0;
currentLeftTuple = leftChild->next();
}
if(!bool(currentLeftTuple))
return {};
auto currentRightTuple = bufferedRightTuples[currentBufferedRightOffset++];
return currentLeftTuple + currentRightTuple;
}
bool prefixesMatch(Tuple left, Tuple right) {
for(size_t i = 0; i < min(left.size(), right.size()); i++)
if(left[i] != right[i])
return false;
return true;
}
size_t nextSlot(size_t value, size_t hashtableSize);
size_t hashTuple(Tuple t, size_t hashtableSize);
void GroupBy::open() {
child->open();
auto inputTuple = child->next();
while(inputTuple) {
auto groupKeys = getGroupKeys(inputTuple);
auto hashValue = hashTuple(groupKeys, hashTable.size());
while(hashTable[hashValue].occupied && //
!prefixesMatch(groupKeys, hashTable[hashValue].data))
hashValue = nextSlot(hashValue, hashTable.size());
hashTable[hashValue].occupied = true;
if(hashTable[hashValue].data.size() != groupKeys.size() + aggregateFunctions.size())
hashTable[hashValue].data.resize(groupKeys.size() + aggregateFunctions.size());
size_t i = 0;
for(; i < groupKeys.size(); i++)
hashTable[hashValue].data[i] = groupKeys[i];
for(size_t j = 0; j < aggregateFunctions.size(); j++)
hashTable[hashValue].data[i + j] =
aggregateFunctions[j](hashTable[hashValue].data[i + j], inputTuple);
inputTuple = child->next();
}
}
size_t nextSlot(size_t i, size_t hashTableSize) {
return (i + 1) % hashTableSize;
};
size_t hashTuple(Tuple, size_t hashTableSize) { return 0; };
void simpleQuery() {
Table input{{5l}, {7l}, {7l}, {9l}};
auto plan = //
make_unique<GroupBy>( //
make_unique<Select>( //
make_unique<Scan>(input), //
[](auto t) { return t[0] < 8l; }), //
[](auto t) { return t; }, //
valarray<AggregationFunction> //
{[](auto v, auto t) { return long(v) + 1; }});
plan->open();
for(auto t = plan->next(); t; t = plan->next())
cout << t << endl;
}
void moreInterestingQuery() {
Table input{{1l, "Holger", "180 Queens Gate"},
{2l, "Sam", "32 Vassar Street"},
{3l, "Peter", "180 Queens Gate"}};
auto plan = //
make_unique<GroupBy>( //
make_unique<Select>( //
make_unique<Scan>(input), //
[](auto t) { return t[2] == string("180 Queens Gate"); }), //
[](auto t) -> Tuple { return {t[1]}; }, //
valarray<AggregationFunction> //
{[](auto v, auto t) { return long(v) + 1; }});
plan->open();
for(auto t = plan->next(); t; t = plan->next())
cout << t << endl;
}
Table readFromDisk(string const& _){
static map<string, Table> filesystem{};
if(filesystem.count(_) == 0)
filesystem[_] = {};
return filesystem[_];
}
size_t BufferedGroupBy::nextGroupOperatorID = 0;
size_t nextSlot(size_t value, size_t hashtableSize);
size_t hashTuple(Tuple t, size_t hashtableSize);
Tuple& BufferedGroupBy::getHashTableEntry(size_t id, Tuple const& groupKeys) {
return bufferManager.getPageForRelationWithID(
relationName,
id /
bufferManager.numberOfTuplesPerPage(hashTableEntrySize(
groupKeys)))[id % bufferManager.numberOfTuplesPerPage(hashTableEntrySize(groupKeys))];
};
size_t BufferedGroupBy::hashTableEntrySize(Tuple const& groupKeys) const {
return groupKeys.size() + aggregateFunctions.size() + 1;
}
void BufferedGroupBy::open() {
auto inputTuple = child->next();
bufferManager.createRelation("groupBuffer" + to_string(nextGroupOperatorID), hashTableSize,
hashTableEntrySize(getGroupKeys(inputTuple)));
while(inputTuple) {
auto groupKeys = getGroupKeys(inputTuple);
auto hashValue = hashTuple(groupKeys, hashTableSize);
for(auto& entry = getHashTableEntry(hashValue, groupKeys);
long(entry[0]) > 0 && !prefixesMatch(groupKeys, entry[slice(1, entry.size() - 1, 1)]);
entry = getHashTableEntry(hashValue, groupKeys)) {
}
// while(long(getHashTableEntry(hashValue, groupKeys)[0]) > 0 && //
// !prefixesMatch(groupKeys, getHashTableEntry(
// hashValue, groupKeys)[slice(1, hashValue.size() - 1, 1)]))
// hashValue = nextSlot(hashValue, hashTableSize);
auto& hashTableEntry = getHashTableEntry(hashValue, groupKeys);
if(hashTableEntry.size() != groupKeys.size() + aggregateFunctions.size() + 1)
hashTableEntry.resize(groupKeys.size() + aggregateFunctions.size() + 1);
hashTableEntry[0] = {1L};
size_t i = 1;
for(; i < groupKeys.size(); i++)
hashTableEntry[i] = groupKeys[i];
for(size_t j = 0; j < aggregateFunctions.size(); j++)
hashTableEntry[i + j] = aggregateFunctions[j](hashTableEntry[i + j], inputTuple);
inputTuple = child->next();
}
}
#pragma once
#include <valarray>
#include <variant>
#include <array>
#include <map>
#include <vector>
#include <variant>
#include <any>
#include <string>
#include <iostream>
#include "Storage.hpp"
using namespace std;
struct SupportedDatatype : public variant<long, double, string> { // <- supported datatypes
// All of this stuff is convenience functions
using variant<long, double, string>::variant;
template <typename T, typename = std::enable_if_t<negation<is_same<T, SupportedDatatype>>::value>>
bool operator==(T other) const {
return get_if<T>(this) && *get_if<T>(this) == other;
}
template <typename T, typename = std::enable_if_t<negation<is_same<T, SupportedDatatype>>::value>>
bool operator<(T other) const {
return get_if<T>(this) && *get_if<T>(this) < other;
}
template <typename T, typename = std::enable_if_t<negation<is_same<T, SupportedDatatype>>::value>>
bool operator>(T other) const {
return get_if<T>(this) && *get_if<T>(this) > other;
}
template <typename T, typename = std::enable_if_t<negation<is_same<T, SupportedDatatype>>::value>>
bool operator>=(T other) const {
return get_if<T>(this) && *get_if<T>(this) >= other;
}
template <typename T, typename = std::enable_if_t<negation<is_same<T, SupportedDatatype>>::value>>
bool operator<=(T other) const {
return get_if<T>(this) && *get_if<T>(this) <= other;
}
template <typename T, typename = std::enable_if_t<negation<is_same<T, SupportedDatatype>>::value>>
friend bool operator>=(T other, SupportedDatatype const& t) {
return get_if<T>(&t) && other >= *get_if<T>(&t);
}
template <typename T, typename = std::enable_if_t<negation<is_same<T, SupportedDatatype>>::value>>
friend bool operator<=(T other, SupportedDatatype const& t) {
return get_if<T>(&t) && other <= *get_if<T>(&t);
}
bool operator<(SupportedDatatype const& t) const {
if(index() < t.index())
return true;
else if(index() > t.index())
return false;
else {
return map<size_t, function<bool()>>{{0, [&]() { return *this < *get_if<0>(&t); }},
{1, [&]() { return *this < *get_if<1>(&t); }},
{2, [&]() { return *this < *get_if<2>(&t); }}}
.at(index())();
}
}
bool operator>(SupportedDatatype const& t) const { return !(*this == t || *this < t); }
template <typename T> explicit operator T() { return get_if<T>(this) ? *get_if<T>(this) : T{}; }
friend ostream& operator<<(ostream& stream, SupportedDatatype const& v) {
if(get_if<long>(&v))
stream << *get_if<long>(&v);
if(get_if<string>(&v))
stream << *get_if<string>(&v);
if(get_if<double>(&v))
stream << *get_if<double>(&v);
return stream;
}
};
struct Tuple : valarray<SupportedDatatype> {
// tuples are equal if all their values are equal
bool operator==(Tuple const& other) const {
return size() == other.size() && equal(begin(*this), end(*this), begin(other));
}
// an empty tuple is treated as false
operator bool() const { return size() > 0; }
// this is convenience stuff
using valarray<SupportedDatatype>::valarray;
Tuple& operator=(Tuple const& other) {
resize(other.size());
for(size_t i = 0; i < other.size(); i++)
(*this)[i] = other[i];
return *this;
}
Tuple& operator|=(Tuple const& other) {
if(other)
*this = other;
return *this;
}
bool operator<(Tuple const& other) const {
if(size() != other.size()) {
return size() < other.size();
}
for(size_t i = 0; i < size(); i++) {
if((*this)[i] < other[i]) {
return true;
} else if((*this)[i] > other[i]) {
return false;
}
}
return false;
}
bool operator>(Tuple const& other) const { return !(other == *this || other < *this); }
Tuple operator+(Tuple const& other) const {
Tuple result(size() + other.size());
size_t i;
for(i = 0; i < size(); i++)
result[i] = (*this)[i];
for(size_t j = 0; j < other.size(); j++)
result[i + j] = other[j];
return result;
}
friend ostream& operator<<(ostream& stream, Tuple const& t) {
for (auto it = begin(t); it != prev(end(t)); ++it)
cout << *it << ", ";
return stream << *prev(end(t));
}
};
using Table = valarray<Tuple>;
struct Operator {
virtual void open() = 0;
virtual Tuple next() = 0;
virtual void close() = 0;
virtual ~Operator(){};
};
struct Scan : Operator {
Table input;
size_t nextTupleIndex = 0;
Scan(Table input) : input(input){};
void open(){};
Tuple next() {
return nextTupleIndex < input.size() //
? input[nextTupleIndex++]
: Tuple{};
};
void close(){};
};
using Projection = function<Tuple(Tuple)>;
struct Project : Operator {
Projection projection;
unique_ptr<Operator> child;
void open() { child->open(); };
Tuple next() { return projection(child->next()); };
void close() { child->close(); };
};
using Predicate = function<bool(Tuple)>;
struct Select : Operator {
unique_ptr<Operator> child;
Predicate predicate;
Select(unique_ptr<Operator> child, Predicate predicate)
: child(move(child)), predicate(predicate){};
void open() { child->open(); };
Tuple next() {
for(auto nextCandidate = child->next(); nextCandidate; //
nextCandidate = child->next())
if(predicate(nextCandidate))
return nextCandidate;
return {};
};
void close() { child->close(); };
};
struct Union : Operator {
unique_ptr<Operator> leftChild;