// 2014-2022, Yakov Galka.
// This work has been released into the public domain.
#include <assert.h>
#include <err.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

// These public domain libraries are available from https://github.com/nothings/stb/
#include <stb_image.h>
#include <stb_image_write.h>

// Using FFTW3 (GPL) for FFT
#include <fftw3.h>

#define ALLOC(T, n) ((T*)malloc(sizeof(T)*n))
#define ALLOCZ(T, n) ((T*)calloc(sizeof(T), n))

int ends_with(const char *x, const char *y) {
	size_t nx = strlen(x), ny = strlen(y);
	return nx >= ny && memcmp(x + nx - ny, y, ny) == 0;
}

int modi(int x, int y) {
	int r = x%y;
	return r >= 0 ? r : r + y;
}

int cmpf(const void *p, const void *q) {
	float x = *(const float*)p, y = *(const float*)q;
	return (x > y) - (x < y);
}

float *load_file(const char *filename, int *w, int *h, int dc)
{
	int nc;
	stbi_uc *src = stbi_load(filename, w, h, &nc, dc);
	if(!src) err(1, "can't load '%s", filename);

	int size = *w * *h * dc;
	float *ret = ALLOC(float, size);
	for(int i = 0; i < size; ++i)
		ret[i] = (src[i] - 128.)/127.;
	free(src);
	return ret;
}

int write_file(const char *filename, int w, int h, int nc, const float *data)
{
	int size = w*h*nc;
	uint8_t *data8 = ALLOC(uint8_t, size);
	for(int i = 0; i < size; ++i)
		data8[i] = lround(fminf(fmaxf(128 + 127*data[i], 0), 255));

	int ret = 0;
	if(ends_with(filename, ".png"))
		ret = stbi_write_png(filename, w, h, nc, data8, w*nc);
	else if(ends_with(filename, ".bmp"))
		ret = stbi_write_bmp(filename, w, h, nc, data8);
	else if(ends_with(filename, ".tga"))
		ret = stbi_write_tga(filename, w, h, nc, data8);
	else if(ends_with(filename, ".jpg"))
		ret = stbi_write_jpg(filename, w, h, nc, data8, 95);
	else
		fputs("can't deduce output format\n", stderr);

	free(data8);
	return ret;
}

void convolve(const float *in, int w, int h, const float *k, int kw, int kh, float *out)
{
	for(int y = 0; y < h; ++y)
	for(int x = 0; x < w; ++x)
	{
		float dx = 0, dy = 0;
		for(int ky = 0; ky < kh; ++ky)
		for(int kx = 0; kx < kw; ++kx)
		{
			dx += in[modi(y - ky + kh/2, h)*w + modi(x - kx + kw/2, w)]*k[ky*kw + kx];
			dy += in[modi(y - kx + kw/2, h)*w + modi(x - ky + kh/2, w)]*k[ky*kw + kx];
		}
		float *p = out + 3*(y*w + x);
		p[0] = dx, p[1] = dy, p[2] = 1;
	}
}

void deconvolve(const float *n, int w, int h, const float *k, int kw, int kh, float threshold, float highpass,  float *out)
{
	float *nx = ALLOC(float, w*h);
	float *ny = ALLOC(float, w*h);
	for(int i = 0, mi = w*h; i < mi; ++i) {
		const float *p = n + 3*i;
		float d = fmaxf(p[2], 1/127.);
		nx[i] = p[0]/d, ny[i] = p[1]/d;
	}


	float *dx = ALLOCZ(float, w*h);
	float *dy = ALLOCZ(float, w*h);
	for(int y = 0; y < kh; ++y)
	for(int x = 0; x < kw; ++x)
	{
		dx[y%h*w + modi(x - kw/2, w)] += k[y*kw + x];
		dy[modi(x - kw/2, h)*w + y%w] += k[y*kw + x];
	}

	fftwf_plan plan;
	const int w2= w/2 + 1;
	typedef fftwf_complex C;
	C *Nx = ALLOC(C, w2*h), *Ny = ALLOC(C, w2*h);
	C *Dx = ALLOC(C, w2*h), *Dy = ALLOC(C, w2*h);

	plan = fftwf_plan_dft_r2c_2d(h, w, nx, Nx, FFTW_ESTIMATE);
	fftwf_execute_dft_r2c(plan, nx, Nx);
	fftwf_execute_dft_r2c(plan, ny, Ny);
	fftwf_execute_dft_r2c(plan, dx, Dx);
	fftwf_execute_dft_r2c(plan, dy, Dy);
	fftwf_destroy_plan(plan);

	int nzeros = 0;
	int rr = ceil(highpass*highpass);
	C *F = ALLOC(C, w2*h);
	for(int y = 0; y < h; ++y)
	for(int x = 0; x < w2; ++x)
	{
		int i = y*w2 + x;
		float *Dxi = Dx[i], *Dyi = Dy[i], *Nxi = Nx[i], *Nyi = Ny[i];
		float denom = w * h * (Dxi[0]*Dxi[0] + Dxi[1]*Dxi[1] + Dyi[0]*Dyi[0] + Dyi[1]*Dyi[1]);
		if(denom <= threshold || x*x + y*y < rr || x*x + (h-y)*(h-y) < rr)
		{
			++nzeros;
			F[i][0] = F[i][1] = 0;
		}
		else
		{
			F[i][0] = (Dxi[0]*Nxi[0] + Dxi[1]*Nxi[1] + Dyi[0]*Nyi[0] + Dyi[1]*Nyi[1]) / denom;
			F[i][1] = (Dxi[0]*Nxi[1] - Dxi[1]*Nxi[0] + Dyi[0]*Nyi[1] - Dyi[1]*Nyi[0]) / denom;
		}
	}

	printf("zeros: %d/%d\n", nzeros, w2*h);

	plan = fftwf_plan_dft_c2r_2d(h, w, F, out, FFTW_ESTIMATE);
	fftwf_execute(plan);
	fftwf_destroy_plan(plan);
	free(nx), free(ny), free(dx), free(dy);
	free(Nx), free(Ny), free(Dx), free(Dy);
	free(F);
}

void help(char *name, FILE *f)
{
	fprintf(f,
"Usage:\n"
"    %s [ --<options> ] [ conv | deconv ] <input> <output>\n"
"Options:\n"
"    -y             Invert y; assumes y axis points 'up'.\n"
"    -n             Normalize output:\n"
"                       For normals so that norm(xyz) = 1 (otherwise z=1)\n"
"                       For height maps so that stdev = 1/3\n"
"    -1             Set kernel to one sided derivatives [-.5 .5] (default)\n"
"    -2             Set kernel to central derivatives [-.5 0 .5]\n"
"    -k <filename>  Set custom kernel for x derivatives (y is transposed)\n"
"    -t <real>      Threshold (default 0)\n"
"    -p <real>      High-pass (default 1)\n"
"Notes:\n"
"    Input is read as integers in [0,255], then mapped to [-1,1].\n"
		, name);
}

static const char kernel1 = 0, kernel2 = 0;
float *load_kernel(const char *kernel, int *w, int *h)
{
	if(kernel != &kernel1 && kernel != &kernel2)
		return load_file(kernel, w, h, 1);

	*w = kernel == &kernel1 ? 2 : 3, *h = 1;
	float *p = ALLOCZ(float, *w);
	p[0] = -0.5;
	p[*w-1] = 0.5;
	return p;
}

int main(int argc, char *argv[])
{
	const char *kernel = &kernel1;
	int normalize = 0, inverty = 0;
	float threshold = 0;
	float highpass = 1;
	int kw, kh, w, h, snc, dnc;

	for(;;) {
		int opt = getopt(argc, argv, "yn12kt:p:");
		switch(opt) {
		case 'y': inverty = 1; continue;
		case 'n': normalize = 1; continue;
		case '1': kernel = &kernel1; continue;
		case '2': kernel = &kernel2; continue;
		case 'k': kernel = optarg; continue;
		case 't': threshold = atof(optarg); continue;
		case 'p': highpass = atof(optarg); continue;
		}
		break;
	}

	if(inverty) {
		stbi_set_flip_vertically_on_load(1);
		stbi_flip_vertically_on_write(1);
	}

	const char *verb  = optind < argc ? argv[optind] : "";
	if(strcmp(verb, "help") == 0)
		return help(argv[0], stdout), 0;
	else if(strcmp(verb, "conv") == 0 && argc - optind == 3)
		snc = 1, dnc = 3;
	else if(strcmp(verb, "deconv") == 0 && argc - optind == 3)
		snc = 3, dnc = 1;
	else
		return help(argv[0], stderr), 1;

	float *k = load_kernel(kernel, &kw, &kh);

	float *src = load_file(argv[optind+1], &w, &h, snc);
	float *dst = ALLOC(float, dnc*w*h);

	if(snc == 1 && dnc == 3)
		convolve(src, w, h, k, kw, kh, dst);
	else if(snc == 3 && dnc == 1)
		deconvolve(src, w, h, k, kw, kh, threshold, highpass, dst);
	else {
		assert(snc == dnc);
		memcpy(dst, src, sizeof(float)*dnc*w*h);
	}

	if(normalize) {
		if(dnc == 3) {
			for(int i = 0, n = w*h; i < n; ++i) {
				float *p = dst + 3*i;
				float f = 1/sqrtf(p[0]*p[0] + p[1]*p[1] + p[2]*p[2]);
				p[0] *= f, p[1] *= f, p[2] *= f;
			}
		} else {
			int n = w*h*dnc;
			float *v = ALLOC(float, n);
			memcpy(v, dst, sizeof(float)*n);
			qsort(v, n, sizeof(float), cmpf);
			float a = v[n/4], b = v[3*n/4];
			float m = 2*a - b, M = 2*b - a;
			for(int i = 0; i < n; ++i) dst[i] = 2*(dst[i] - m)/(M - m) - 1;
		}
	}

	if(!write_file(argv[optind+2], w, h, dnc, dst)) err(1, "write_file failed");

	return 0;
}