CUDA-Hough-Transform/AhTwoArraysToMatrix.cu at master · AndiH/CUDA-Hough-Transform · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
#include "AhTwoArraysToMatrix.h"
#include "AhTranslatorFunction.h"

/* TODO
 *
 * find out why old idea of functions doesnt work
 *
 * deliver two functions:
 * 	give translated value
 * 	give array translated VALUES
 *
 * peak finder
 */


AhTwoArraysToMatrix::AhTwoArraysToMatrix()
{}

AhTwoArraysToMatrix::AhTwoArraysToMatrix(thrust::host_vector<double> xValues, thrust::host_vector<double> yValues, int nbinsx, double xlow, double xup, int nbinsy, double ylow, double yup, bool doTiming, bool doBoundaryCheck)
: fXValues(xValues),
  fYValues(yValues),
  fNBinsX(nbinsx),
  fNBinsY(nbinsy),
  fXlow(xlow),
  fXup(xup),
  fYlow(ylow),
  fYup(yup),
  fDoTiming(doTiming),
  fReTranslationHasBeenDone(false)
{
	bool dontbreak = true;

	fXStepWidth = (fXup - fXlow)/(fNBinsX);
	fYStepWidth = (fYup - fYlow)/(fNBinsY);

	if (doBoundaryCheck == true) dontbreak = DoBoundaryCheck();

	if (dontbreak == true) {
		DoTranslations();

		CalculateHistogram();
	} else {
		std::cout << "Did nothing due to some error." << std::endl;
	}

}


AhTwoArraysToMatrix::~AhTwoArraysToMatrix()
{}

bool AhTwoArraysToMatrix::DoBoundaryCheck() {
	bool everythingIsOk = true;

	double minimumX = *(thrust::min_element(fXValues.begin(), fXValues.end()));
	double maximumX = *(thrust::max_element(fXValues.begin(), fXValues.end()));
	double minimumY = *(thrust::min_element(fYValues.begin(), fYValues.end()));
	double maximumY = *(thrust::max_element(fYValues.begin(), fYValues.end()));

	if (minimumX < fXlow || maximumX > fXup || minimumY < fYlow || maximumY > fYup) {
		std::cout << "Values are out of boundaries! Breaking!" << std::endl;
		std::cout << "  " << minimumX << " !>= " << fXlow << ",   " << maximumX << "!<=" << fXup << std::endl;
		std::cout << "  " << minimumY << " !>= " << fYlow << ",   " << maximumY << "!<=" << fYup << std::endl;
		everythingIsOk = false;
	}

	return everythingIsOk;
}

void AhTwoArraysToMatrix::DoTranslations()
{
	if (true == fDoTiming) {
		fSwTranslateValues = new TStopwatch(); // Old version with TStopwatch, kept for backwards compability

		cudaEventCreate(&start);
		cudaEventCreate(&stop);
		cudaThreadSynchronize(); // make sure everything is ready
		cudaEventRecord(start, 0); // create start point
	}

	fTranslatedXValues = TranslateValuesToMatrixCoordinates(fXValues, 1/fXStepWidth, fXlow);
	fTranslatedYValues = TranslateValuesToMatrixCoordinates(fYValues, 1/fYStepWidth, fYlow);

	if (true == fDoTiming) {
		fSwTranslateValues->Stop(); // Old
		cudaThreadSynchronize();

		cudaEventRecord(stop, 0); // create stop point
		cudaEventSynchronize(stop); // wait for stop to finish
		cudaEventElapsedTime(&fTimeTranslateValues, start, stop); // in mili seconds
		cudaEventDestroy(start);
		cudaEventDestroy(stop);
	}

	// ### DEBUG OUTPUT
// 	for (int i = 0; i < fTranslatedXValues.size(); i++) {
// 		std::cout << "(x,y)_" << i << " = (" << fXValues[i] << "," << fYValues[i] << ") --> (" << fTranslatedXValues[i] << "," << fTranslatedYValues[i] << ")" << std::endl;
// 	}
}

thrust::device_vector<int> AhTwoArraysToMatrix::TranslateValuesToMatrixCoordinates (const thrust::device_vector<double> &values, double inverseStepWidth, double lowValue) {
	thrust::device_vector<int> tempVec(values.size());

	thrust::transform(values.begin(), values.end(), tempVec.begin(), AhTranslatorFunction(inverseStepWidth, lowValue));

	return tempVec;
}

thrust::device_vector<double> AhTwoArraysToMatrix::RetranslateValuesFromMatrixCoordinates (const thrust::device_vector<int> &values, double inverseStepWidth, double lowValue) {
	thrust::device_vector<double> tempVec(values.size());

	thrust::transform(values.begin(), values.end(), tempVec.begin(), AhTranslatorFunction(inverseStepWidth, lowValue)); // uses operator for INTs (which is the retranslation operator)

	return tempVec;
}

void AhTwoArraysToMatrix::CalculateHistogram()
{
	thrust::device_vector<int> weightVector(fTranslatedXValues.size(), 1); // just dummy -- each cell should have weight of 1 // TODO: This, once, of course can be changed to support different weightes

	// allocate storage for unordered triplets
	cusp::array1d<int, cusp::device_memory> I(fTranslatedXValues.begin(), fTranslatedXValues.end());  // row indices
	cusp::array1d<int, cusp::device_memory> J(fTranslatedYValues.begin(), fTranslatedYValues.end());  // column indices
	cusp::array1d<double, cusp::device_memory> V(weightVector.begin(), weightVector.end());  // values

	if (true == fDoTiming) {
		fSwHistSort = new TStopwatch(); // old

		cudaEventCreate(&start);
		cudaEventCreate(&stop);
		cudaThreadSynchronize();
		cudaEventRecord(start, 0);
	}
	// sort triplets by (i,j) index using two stable sorts (first by J, then by I)
	/** The Trick (of following line):
	 *
	 * Generally speaking, sort_by_key sorts a vector A in ascending order, while also putting the corresponding (paired) element of a second vector B into the new position -- leading to vector A being sorted after your wish and a vector B, of which each element is 'chained' to the original one in A in therefore being sorted the way A is.
	 *
	 * stable_sort_by_key will preserve the original order if two (or more) elements are equal (this is not important for this case, I guess)
	 *
	 * So, in the first example
	 * 	J is sorted by 'operator<'
	 * 	At the same time the tuple of (I,V) is put in J's order
	 *
	 * The second example sorts I, so that in the end I and J are both sorted ascending
	 *
	 * Further read: http://docs.thrust.googlecode.com/hg/group__sorting.html#ga2bb765aeef19f6a04ca8b8ba11efff24
	 */
	thrust::stable_sort_by_key(
		J.begin(),
		J.end(),
		thrust::make_zip_iterator(
			thrust::make_tuple(
				I.begin(),
				V.begin()
			)
		)
	);
	thrust::stable_sort_by_key(
		I.begin(),
		I.end(),
		thrust::make_zip_iterator(
			thrust::make_tuple(
				J.begin(),
				V.begin()
			)
		)
	);
	if (true == fDoTiming) {
		fSwHistSort->Stop(); // old

		cudaEventRecord(stop, 0);
		cudaEventSynchronize(stop);
		cudaEventElapsedTime(&fTimeHistSort, start, stop);
		cudaEventDestroy(start);
		cudaEventDestroy(stop);
	}
	// compute unique number of nonzeros in the output
	int num_entries = thrust::inner_product(
		thrust::make_zip_iterator(
			thrust::make_tuple(I.begin(), J.begin())
		),
		thrust::make_zip_iterator(
			thrust::make_tuple(I.end (),  J.end())
		) - 1,
		thrust::make_zip_iterator(
			thrust::make_tuple(I.begin(), J.begin())
		) + 1,
		int(0),
		thrust::plus<int>(),
		thrust::not_equal_to< thrust::tuple<int,int> >()
	) + 1;

	// allocate output matrix
	cusp::coo_matrix<int, double, cusp::device_memory> A(fNBinsX, fNBinsY, num_entries);

	if (true == fDoTiming) {
		fSwHistSum = new TStopwatch(); // old

		cudaEventCreate(&start);
		cudaEventCreate(&stop);
		cudaThreadSynchronize();
		cudaEventRecord(start, 0);
	}
	// sum values with the same (i,j) index
	/** The Trick (of the following line):
	 * reduce_by_key sums up values V(I,J), if two (three...n-2) consecutive key_parameters (=tuples) (I_0..n,J_0..n) are equal
	 * if so, the matching key_tuples (I_0,J_0) are put into the fourth argument (==matrix indices) and the summed up values of V(I_0..n,J_0..n) are put into the fifth argument (==matrix values).
	 *
	 * equalness (equality?) is tested by invoking the second-to-last argument,
	 * reduction is done by invoking th elast argument
	 *
	 * See this example for clarification: http://docs.thrust.googlecode.com/hg/group__reductions.html#ga633d78d4cb2650624ec354c9abd0c97f
	 */
	thrust::reduce_by_key(
		thrust::make_zip_iterator(
			thrust::make_tuple(I.begin(), J.begin())
		),
		thrust::make_zip_iterator(
			thrust::make_tuple(I.end(),   J.end())
		),
		V.begin(),
		thrust::make_zip_iterator(
			thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())
		), // TODO: maybe it makes sense to put a tuple of vectors in here? investigate for performance
		A.values.begin(), // TODO: maybe it makes sense to put a plain vector in here? then combine this one and the tuple from the line above into a matrix
		thrust::equal_to< thrust::tuple<int,int> >(),
		thrust::plus<double>()
	);
	if (true == fDoTiming) {
		fSwHistSum->Stop(); // old

		cudaEventRecord(stop, 0);
		cudaEventSynchronize(stop);
		cudaEventElapsedTime(&fTimeHistSum, start, stop);
		cudaEventDestroy(start);
		cudaEventDestroy(stop);
	}

	fCUSPMatrix = A;
	fI = I;
	fJ = J;
	fV = V;
}

TMatrixD AhTwoArraysToMatrix::GetTMatrixD()
{
	if (true == fDoTiming) fSwCreateTMatrixD = new TStopwatch();
	TMatrixD myMatrix(fNBinsY, fNBinsX);
//	std::cout << myMatrix.GetRowUpb() << " "<< myMatrix.GetColUpb() << std::endl;
//	std::cout << fCUSPMatrix.num_rows << " " << fCUSPMatrix.num_cols << std::endl;
	for (int i = 0; i < fCUSPMatrix.num_entries; i++) {
		int column_index = fCUSPMatrix.column_indices[i];
		int row_index = fCUSPMatrix.row_indices[i];
		int value = fCUSPMatrix.values[i];

//		std::cout << i << " - columindex = " << column_index << ", rowindex = " <<  row_index << ", value = " << value << std::endl;

		if (column_index <= fNBinsY && column_index >= 0 && row_index <= fNBinsX && row_index >= 0) {
			myMatrix[column_index][row_index] = value;
		} else {
//			std::cout << "matrix element not filled" << std::endl;
		}
	}
	if (true == fDoTiming) fSwCreateTMatrixD->Stop();

	return myMatrix;
}

TH2D * AhTwoArraysToMatrix::GetHistogram()
{
	// Gives a pointer to a TH2D histogram with proper, re-translated boundaries
	if (true == fDoTiming) fSwCreateTH2D = new TStopwatch();

	TH2D * tempHisto = new TH2D(GetTMatrixD());
	tempHisto->GetXaxis()->SetLimits(fXlow, fXlow + fNBinsX * fXStepWidth);
	tempHisto->GetYaxis()->SetLimits(fYlow, fYlow + fNBinsY * fYStepWidth);

	if (true == fDoTiming) fSwCreateTH2D->Stop();

	return tempHisto;
}

/**
 * @brief Return all important values of constructed matrix in a list-like fashion
 * @return A device vector of a thrust::tuple with x-value (int), y-value (int) and the multiplicity of that bin (double)
 *
 * Example access: thrust::get<0>(this->GetPlainMatrixValues()[0])
 */
thrust::device_vector<thrust::tuple<int, int, double> > AhTwoArraysToMatrix::GetPlainMatrixValues()
{
	thrust::device_vector< thrust::tuple<int, int, double> > allStuff(
		thrust::make_zip_iterator(
			thrust::make_tuple(
				fCUSPMatrix.row_indices.begin(),
				fCUSPMatrix.column_indices.begin(),
				fCUSPMatrix.values.begin()
			)
		),
		thrust::make_zip_iterator(
			thrust::make_tuple(
				fCUSPMatrix.row_indices.end(),
				fCUSPMatrix.column_indices.end(),
				fCUSPMatrix.values.end()
			)
		)
	);
	return allStuff;
}

void AhTwoArraysToMatrix::DoRetranslations()
{
	fRetranslatedXValues = RetranslateValuesFromMatrixCoordinates(CuspVectorToDeviceVector(fCUSPMatrix.row_indices), 1/fXStepWidth, fXlow);
	fRetranslatedYValues = RetranslateValuesFromMatrixCoordinates(CuspVectorToDeviceVector(fCUSPMatrix.column_indices), 1/fYStepWidth, fYlow);
}

thrust::device_vector<thrust::tuple<double, double, double> > AhTwoArraysToMatrix::GetRetranslatedMatrixValues()
{
	// First off, translate I and J from matrix int space into real double space
	if (false == fReTranslationHasBeenDone) {
		DoRetranslations();
		fReTranslationHasBeenDone = true;
	}

	// make dev vec tuple construct
	thrust::device_vector< thrust::tuple<double, double, double> > allStuff(
		thrust::make_zip_iterator(
			thrust::make_tuple(
				fRetranslatedXValues.begin(),
				fRetranslatedYValues.begin(),
				fCUSPMatrix.values.begin()
			)
		),
		thrust::make_zip_iterator(
			thrust::make_tuple(
				fRetranslatedXValues.end(),
				fRetranslatedYValues.end(),
				fCUSPMatrix.values.end()
			)
		)
	);
	return allStuff;
}

template <class T>
thrust::device_vector<T> AhTwoArraysToMatrix::CuspVectorToDeviceVector(const cusp::array1d<T, cusp::device_memory> &cuspvec)
{
	thrust::device_vector<T> values(cuspvec.begin(), cuspvec.end());
	return values;
}

thrust::device_vector<int> AhTwoArraysToMatrix::GetPlainXValues()
{
	return CuspVectorToDeviceVector<int>(fJ);
}

thrust::device_vector<int> AhTwoArraysToMatrix::GetPlainYValues()
{
	return CuspVectorToDeviceVector<int>(fI);
}

thrust::device_vector<double> AhTwoArraysToMatrix::GetMultiplicities()
{
	return CuspVectorToDeviceVector<double>(fV);
}