Issue:
内存只能容纳100W个数,
现在有1亿数,混序,
然后求这一亿个数的中位数,
中位数(就是一个有序数组的中间数字,数组长度为偶数就是两个,奇数则只有一个)
/*****************************************************************************/
/* this program should find out the median(s) in N(logN)/(logB) no matter there
/* is duplicated data or not.
/*
/* with memory limitation: 4MB memory available. in fact, it can work on 1MB.
/* I would like to express the idea with code.
/* here is just the pseudocode with out any optimization for easy understanding.
/*
/* maybe, there is no need to check sum == N/2 or N/2+1, but this will get the
/* median(s) as soon as possible with out more recursive invoking.
/*
/* sorry for any logical problem. please let me know if you found any.
/*****************************************************************************/
int N = 100000000;// total data number is N
int B = 1000000; // number of buckets.
int min, max;
scan data from 0 to N-1 get min and max; // O(N);
// num the data number that is less than min
// min the minimum data in the target data range to find median(s)
// max the maximum data in the target data range to find median(s)
void find_median(int num=0, int min, int max)
{
if(min == max) {
if(N%2) {
print medians are min and max;
}
else print median is min; // show you the result
return; // exit
}
// count all the data in O(N)
int m = max-min > B ? ((max-min)%B ? (max-min)/B + 1 : (max-min)/B) : 1;
int cnt[B] = {0}; // count the data read.
while(!EOF) {
int data = read_data()-min;
// count data in this range [min, max]
// data/m will get the position of data to increase the cnt[?] value
if(data >= min && data <= max) cnt[data/m]++;
}
int sum = num, median_1, median_2, i = 0;
while(sum < N/2) sum += cnt[i++];
i--; // median is in the range of [i*m, (i+1)*m-1]
/* get median(s) when sum is N/2 */
if(sum == N/2) {
if(N%2) { // N is even and there are two medians.
median_1 = (i+2)*m-1;
median_2 = i*m;
while(!EOF) {
int data = read_data()-min;
if(data/m == i) {
median_2 = (data > median_2 ? data : median_2);
}
if(data/m == i+1) {
median_1 = (data < median_1 ? data : median_1);
}
}
pintf medians are median_1 and median_2;
return;
}
else { // N is an odd number and there is one median.
median_1 = (i+2)*m-1;
while(!EOF) {
int data = read_data();
if(data/m == i+1) median_1 = (data < median_1 ? data : median_1);
}
print median is median_1;
return;
}
}
/* get median(s) when sum is N/2+1 */
if(sum == N/2+1) {
if(N%2) { // N is even and there are two medians.
median_1 = i*m;
median_2 = i*m;
while(!EOF) {
int data = read_data();
if(data/m == i) {
median_1 = max;
median_2 = (data > median_2 ? data : median_2);
}
}
pintf medians are median_1 and median_2;
return;
}
else {
median_2 = i*m; // get (N/2+1)th data.
while(!EOF) {
int data = read_data();
if(data/m == i) median_2 = (data > median_2 ? data : median_2);
}
print median is median_2;
return;
}
}
// if sum is not N/2 or (N/2 + 1), recursively to find out the median(s)
min = (i+1)*m-1;
max = i*m;
// get min and max for next processing
while(!EOF)
{
int data = read_data()-min;
if(data/m == i)
{
min = (data < min ? data : min);
max = (data > max ? data : max);
}
}
find_median(sum, min, max);
}