#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <vulkan/vulkan.hpp>
#include <experimental/optional>
#include "gpu_conv.hpp"
// Edge length of the convolution kernel (main() allocates kernelSize*kernelSize floats).
uint32_t kernelSize = 3;
// Edge length of the input (main() allocates inputSize*inputSize floats).
uint32_t inputSize = 1024;
// Requested work-group edge length; workGroupSize() clamps it to the device limits.
uint32_t workgroupSize_m = 1024;
// Program-wide Vulkan handles; all three are assigned at the start of main().
// Objects with static storage are destroyed in reverse declaration order, so
// `device` is released before `instance` -- keep this ordering.
vk::UniqueInstance instance;
// The GPU selected by pick_physical_dev().
vk::PhysicalDevice physical_dev;
// Logical device created on `physical_dev`; used by the helper functions below.
vk::UniqueDevice device;
// Exception type thrown by the helpers in this file. It adds nothing to
// std::runtime_error beyond a distinct type and is constructed the same way.
struct Error : std::runtime_error {
  using std::runtime_error::runtime_error;
};
// Edge length of the convolution result: the input edge shrunk by
// (kernelSize - 1). Computed in unsigned 32-bit arithmetic.
uint32_t outputSize(){
  const uint32_t shrink = kernelSize - 1;
  return inputSize - shrink;
}
// Chooses the work-group edge length for the compute dispatch.
//
// The result is workgroupSize_m clamped into [smallestEdge, largestEdge], where
//  - largestEdge is the smaller of the device's x/y work-group size limits and
//    the square root of its per-group invocation limit (presumably because the
//    group is edge x edge invocations -- confirm against the shader), and
//  - smallestEdge is the edge needed so that outputSize() can be covered without
//    exceeding the device's x/y work-group count limit.
//
// Throws Error when the device limits leave no valid edge length.
uint32_t workGroupSize(){
  const auto deviceLimits = physical_dev.getProperties().limits;

  const uint32_t invocationEdge =
      static_cast<uint32_t>(sqrt(deviceLimits.maxComputeWorkGroupInvocations));
  const uint32_t dimensionEdge =
      std::min<uint32_t>(deviceLimits.maxComputeWorkGroupSize[0],
                         deviceLimits.maxComputeWorkGroupSize[1]);
  const uint32_t largestEdge = std::min<uint32_t>(dimensionEdge, invocationEdge);

  const uint32_t maxGroupCount =
      std::min<uint32_t>(deviceLimits.maxComputeWorkGroupCount[0],
                         deviceLimits.maxComputeWorkGroupCount[1]);
  // Rounded-up division of outputSize() by maxGroupCount, but never below 1.
  const uint32_t smallestEdge =
      std::max<uint32_t>(1, (outputSize() - 1) / maxGroupCount + 1);

  if (largestEdge < smallestEdge) {
    throw Error("no possible valid work group size");
  }

  const uint32_t requested = std::max(smallestEdge, workgroupSize_m);
  return std::min(requested, largestEdge);
}
std::vector<char> readFile(const std::string& filename){
std::ifstream file(filename, std::ios::ate | std::ios::binary);
if (!file.is_open()) {
throw Error("failed to open file!");
}
std::vector<char> buffer(file.tellg());
file.seekg(0);
file.read(buffer.data(), buffer.size());
assert(file.gcount() == static_cast<ssize_t>(buffer.size()));
file.close();
return buffer;
}
// Finds the first queue family of `device` that has at least one queue and
// supports the capability `type`.
//
// Returns the family's index, or an empty optional when no family qualifies.
std::experimental::optional<uint32_t>
get_queue_family_index(const vk::PhysicalDevice& device,
                       vk::QueueFlagBits type){
  const auto queueFamilies = device.getQueueFamilyProperties();
  const auto iter = std::find_if(queueFamilies.begin(), queueFamilies.end(),
      [type](const vk::QueueFamilyProperties& q) {
        // Test the capability bit with '&'. The previous '|' produced a
        // non-zero mask for every family, so the first family with any queue
        // was accepted regardless of what it supports.
        return q.queueCount > 0 && (q.queueFlags & type);
      });
  if (iter == queueFamilies.end()) return {};
  return static_cast<uint32_t>(iter - queueFamilies.begin());
}
// Index of a compute-capable queue family on `physicalDevice`.
// Uses optional::value(), so it throws when get_queue_family_index() finds
// no such family.
uint32_t queue_family_index(const vk::PhysicalDevice &physicalDevice){
  const auto computeFamily =
      get_queue_family_index(physicalDevice, vk::QueueFlagBits::eCompute);
  return computeFamily.value();
}
/* ****************** */
// Creates the Vulkan instance for the application named "Convolution Test",
// application version 1.0.0, targeting Vulkan API 1.0. No layers or extensions
// are requested.
vk::UniqueInstance create_unique_instance(){
  vk::ApplicationInfo applicationInfo("Convolution Test");
  applicationInfo.setApplicationVersion(VK_MAKE_VERSION(1, 0, 0));
  applicationInfo.setApiVersion(VK_API_VERSION_1_0);

  vk::InstanceCreateInfo instanceInfo;
  instanceInfo.setPApplicationInfo(&applicationInfo);
  return vk::createInstanceUnique(instanceInfo);
}
// Returns the first physical device, in enumeration order, for which
// get_queue_family_index() reports a compute queue family.
// `instance` must point to a valid instance.
// Throws Error when no enumerated device qualifies.
vk::PhysicalDevice pick_physical_dev(vk::UniqueInstance *instance){
  const auto candidates = (*instance)->enumeratePhysicalDevices();
  for (const auto& candidate : candidates) {
    if (get_queue_family_index(candidate, vk::QueueFlagBits::eCompute)) {
      return candidate;
    }
  }
  throw Error("No suitable GPU found");
}
// Creates a logical device on `physicalDevice` with a single queue (priority 1)
// from family `queueFamilyIndex`. The feature struct is default-constructed and
// no extensions are requested.
vk::UniqueDevice create_logical_device(uint32_t queueFamilyIndex,
                                       vk::PhysicalDevice physicalDevice){
  // Must stay alive until createDeviceUnique() returns: the create-info
  // structs below only hold pointers to these locals.
  float priority = 1;
  vk::PhysicalDeviceFeatures enabledFeatures;

  vk::DeviceQueueCreateInfo queueInfo;
  queueInfo.setQueueFamilyIndex(queueFamilyIndex)
           .setQueueCount(1)
           .setPQueuePriorities(&priority);

  vk::DeviceCreateInfo deviceInfo;
  deviceInfo.setQueueCreateInfoCount(1)
            .setPQueueCreateInfos(&queueInfo)
            .setPEnabledFeatures(&enabledFeatures);

  return physicalDevice.createDeviceUnique(deviceInfo);
}
// Creates, on the global `device`, a descriptor set layout with three
// storage-buffer bindings (slots 0, 1 and 2), each holding one descriptor
// visible to the compute stage.
vk::UniqueDescriptorSetLayout create_descriptor_set_layout(){
  // All three bindings differ only in their slot number.
  const auto storageBufferAt = [](uint32_t slot) {
    return vk::DescriptorSetLayoutBinding(slot,
                                          vk::DescriptorType::eStorageBuffer,
                                          1,
                                          vk::ShaderStageFlagBits::eCompute);
  };
  const std::array<vk::DescriptorSetLayoutBinding, 3> bindings
  {{
    storageBufferAt(0),
    storageBufferAt(1),
    storageBufferAt(2)
  }};

  const vk::DescriptorSetLayoutCreateInfo layoutInfo(
      vk::DescriptorSetLayoutCreateFlags(),
      static_cast<uint32_t>(bindings.size()),
      bindings.data());
  return device->createDescriptorSetLayoutUnique(layoutInfo);
}
// Loads the SPIR-V binary at `filename` and wraps it in a shader module
// created on `device`.
//
// Throws Error when the file cannot be read (via readFile) or when its size
// cannot be a valid SPIR-V module.
vk::UniqueShaderModule load_shader(const char *filename, vk::Device &device){
  const auto shaderCode = readFile(filename);

  // Vulkan requires codeSize to be non-zero and a multiple of 4 (SPIR-V is a
  // stream of 32-bit words). Reject anything else here instead of handing the
  // driver an invalid module.
  if (shaderCode.empty() || shaderCode.size() % sizeof(uint32_t) != 0) {
    throw Error("invalid SPIR-V shader file size");
  }

  const vk::ShaderModuleCreateInfo moduleInfo(
      vk::ShaderModuleCreateFlags(),
      shaderCode.size(),
      reinterpret_cast<const uint32_t*>(shaderCode.data()));
  return device.createShaderModuleUnique(moduleInfo, nullptr);
}
// Builds the compute pipeline on the global `device`.
//
// The shader entry point is "main". Three 32-bit specialization constants are
// supplied: id 0 = kernelSize, id 1 = outputSize(), id 2 = workGroupSize().
// `pipelineLayout` and `shaderModule` must point to valid handles.
vk::UniquePipeline init_pipeline(vk::UniquePipelineLayout *pipelineLayout,
                                 vk::UniqueShaderModule *shaderModule){
  // Host-side block the specialization constants are read from; sizes are
  // passed to the shader this way.
  struct SpecConstants {
    uint32_t kernelSize;
    uint32_t outputSize;
    uint32_t workGroupSize;
  };

  // constant id, byte offset inside SpecConstants, byte size
  const std::array<vk::SpecializationMapEntry, 3> constantLayout
  {{
    {0, offsetof(SpecConstants, kernelSize),
     sizeof(SpecConstants::kernelSize)},
    {1, offsetof(SpecConstants, outputSize),
     sizeof(SpecConstants::outputSize)},
    {2, offsetof(SpecConstants, workGroupSize),
     sizeof(SpecConstants::workGroupSize)}
  }};

  const SpecConstants constantValues { kernelSize,
                                       outputSize(),
                                       workGroupSize() };

  const vk::SpecializationInfo specialization(
      static_cast<uint32_t>(constantLayout.size()),
      constantLayout.data(),
      sizeof(SpecConstants),
      &constantValues);

  vk::PipelineShaderStageCreateInfo stageInfo;
  stageInfo.setStage(vk::ShaderStageFlagBits::eCompute)
           .setModule(**shaderModule)
           .setPName("main")
           .setPSpecializationInfo(&specialization);

  vk::ComputePipelineCreateInfo pipelineInfo;
  pipelineInfo.setLayout(**pipelineLayout)
              .setStage(stageInfo);

  return device->createComputePipelineUnique(nullptr, pipelineInfo);
}
// Entry point: creates the instance, device, pipeline, the three storage
// buffers and the descriptor pool for the convolution shader.
//
// Returns EXIT_SUCCESS on success. Any std::exception (Error, Vulkan-Hpp
// errors, an empty optional) is reported on stderr and turned into
// EXIT_FAILURE, so the stack is unwound and the handles are released.
int main(int argc, char* argv[]){
  (void)argc;  // command-line arguments are not used
  (void)argv;
  try {
    instance = create_unique_instance();
    physical_dev = pick_physical_dev(&instance);

    // Look the compute queue family up once and reuse it below.
    const uint32_t computeFamily = queue_family_index(physical_dev);
    device = create_logical_device(computeFamily, physical_dev);
    vk::Queue queue = device->getQueue(computeFamily, 0);

    vk::UniqueCommandPool command_pool =
      device->createCommandPoolUnique
      (vk::CommandPoolCreateInfo
       (vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
        computeFamily));

    vk::UniqueDescriptorSetLayout descriptor_set_layout =
      create_descriptor_set_layout();
    vk::UniquePipelineLayout pipeline_layout =
      device->createPipelineLayoutUnique({vk::PipelineLayoutCreateFlags(),
                                          1,
                                          &*descriptor_set_layout});

    vk::UniqueShaderModule shader_module =
      load_shader("shaders/convolution.spv", *device);
    vk::UniquePipeline pipeline =
      init_pipeline(&pipeline_layout, &shader_module);

    // One pool entry covering the three storage-buffer descriptors.
    std::array<vk::DescriptorPoolSize, 1> poolSizes {{
      {vk::DescriptorType::eStorageBuffer, 3}
    }};

    // Byte size of an edge x edge float buffer. Widened before multiplying so
    // the product is not computed in 32-bit arithmetic.
    const auto bufferBytes = [](uint32_t edge) {
      return static_cast<std::size_t>(edge) * edge * sizeof(float);
    };

    struct Resources {
      vku::Resource input;
      vku::Resource kernel;
      vku::Resource output;
    };
    // Uses the global `physical_dev`; the previous `physicalDevice` is not
    // declared anywhere in this file.
    Resources resources = {
      {*device, physical_dev,
       vk::BufferUsageFlagBits::eStorageBuffer,
       vk::MemoryPropertyFlagBits::eHostVisible,
       bufferBytes(inputSize)},
      {*device, physical_dev,
       vk::BufferUsageFlagBits::eStorageBuffer,
       vk::MemoryPropertyFlagBits::eHostVisible,
       bufferBytes(kernelSize)},
      {*device, physical_dev,
       vk::BufferUsageFlagBits::eStorageBuffer,
       vk::MemoryPropertyFlagBits::eHostVisible,
       bufferBytes(outputSize())}};

    vk::UniqueDescriptorPool descriptorPool
      = device->createDescriptorPoolUnique({vk::DescriptorPoolCreateFlags(), 1,
          static_cast<uint32_t>(poolSizes.size()), poolSizes.data()});

    // not unique since they are deallocated when the pool is destroyed
    // NOTE(review): createDescriptorSets() is not defined in this file --
    // presumably declared in gpu_conv.hpp; confirm.
    std::vector<vk::DescriptorSet> descriptorSet = createDescriptorSets();
  } catch (const std::exception& e) {
    std::cerr << "error: " << e.what() << '\n';
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}