uint32_t mulAddWWW(uint32_t x,uint32_t y, uint32_t c, uint32_t *z0) {
	uint32_t zz0 = x * y;
	*z0 = zz0 + c;
	uint32_t c1 = !!(*z0 < zz0);
	return mad_hi(x, y, c1);//返回(x*y的上半部分,并且加上c1)
}

uint32_t addWW(uint32_t x, uint32_t y, uint32_t c, uint32_t *z0) {
	uint32_t z1 = 0;
	uint32_t yc = y + c;
	*z0 = x + yc;
	if (*z0 < x || yc < y) {
		z1 = 1;
	}
	return z1;
}

uint32_t addMulVVW(uint32_t *z, uint32_t n, constant uint32_t *x, uint32_t y) {
	uint32_t c = 0;
	for (int i = 0; i < n; i++) {
		uint32_t z0;
		uint32_t z1 = mulAddWWW(x[i], y, z[i], &z0);
		c = addWW(z0, c, 0, &z[i]);
		c += z1;
	}
	return c;
}

uint32_t addMulVVWZero(uint32_t *z, uint32_t n, constant uint32_t *x, uint32_t y) {
	uint32_t c = 0;
	for (int i = 0; i < n; i++) {
		uint32_t z0;
		uint32_t z1 = mulAddWWW(x[i], y, 0, &z0);
		c = addWW(z0, c, 0, &z[i]);
		c += z1;
	}
	return c;
}


//p has len 512 bits
//x has len 512 bits (16 words)
//y has len 256 bits (8 words)
//p = x * y
void basicMul(uint32_t *p, constant uint32_t *x, uint32_t *y) {
    uint32_t d = y[7];
    addMulVVWZero(&p[0], MUL_CONSTANT_SIZE-0, x, d);

	for (int i = 1; i < 8; i++) {
		uint32_t d = y[7-i];
        addMulVVW(&p[i], MUL_CONSTANT_SIZE-i, x, d);
	}
}

int divisible(uint32_t *x, constant uint32_t *c, constant uint32_t *cSub1) {
	uint32_t p[16];
	basicMul(p, c, x);
	int i = MUL_CONSTANT_SIZE-1;
	while(i > 0 && p[i] == cSub1[i]) {
		i--;
	}
	return p[i] <= cSub1[i];
}