openmp 32 스레드가 1 스레드보다 훨씬 느린 이유는 무엇입니까?

Dec 17 2020

2 배열의 l2 표준을 계산하는 응용 프로그램을 작성하려고합니다. 나는 내 계산을 병행해야한다.

다음은 내가 병렬화 한 코드입니다.

  double time_start_openmp = omp_get_wtime();
  #pragma omp parallel for
  for (i = 0; i < n; i++)
  {
       numberOfThreads = omp_get_num_threads();
       double local_diff = x[i] - xseq[i];
       diff_vector[i] = local_diff;
       l2_norm += (local_diff * local_diff);
  }

   time_end_openmp = omp_get_wtime();

   l2_norm = sqrt(l2_norm);

   openmp_exec_time = time_end_openmp - time_start_openmp;
   printf("OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);

코드를 다음과 같이 컴파일합니다.

gcc -fopenmp -g -ggdb -Wall -lm -o test test.c

이 코드를 1 개의 스레드와 32 개의 스레드로 실행하고 있습니다. 출력은 예상되는 것과 정반대입니다. 다음은 출력 예입니다.

[hayri@hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=32 [hayri@hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 32 0.001084 0.000000000000e+00
[hayri@hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=1 [hayri@hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 1 0.000106 0.000000000000e+00

내가 잘못 보거나 32 개의 스레드를 사용하는 것이 1 개의 스레드보다 10 배 느립니까? 그래서 내가 여기서 뭘 잘못하고 있니?

내 전체 코드는 다음과 같습니다.

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <math.h>

#define MATSIZE 2000

static size_t totalMemUsage = 0;

size_t vectors_dot_prod(double *x, double *y, size_t n)
{
    double res = 0.0;
    size_t i;
    for (i = 0; i < n; i++)
    {
        res += x[i] * y[i];
    }
    return res;
}

size_t vectors_dot_prod2(double *x, double *y, size_t n)
{
    size_t res = 0.0;
    size_t i = 0;
    for (; i <= n - 4; i += 4)
    {
        res += (x[i] * y[i] +
                x[i + 1] * y[i + 1] +
                x[i + 2] * y[i + 2] +
                x[i + 3] * y[i + 3]);
    }
    for (; i < n; i++)
    {
        res += x[i] * y[i];
    }
    return res;
}

void matrix_vector_mult(double **mat, double *vec, double *result, size_t rows, size_t cols)
{ // in matrix form: result = mat * vec;
    size_t i;
    for (i = 0; i < rows; i++)
    {
        result[i] = vectors_dot_prod2(mat[i], vec, cols);
    }
}

double get_random()
{

    double range = 1000;
    double div = RAND_MAX / range;
    double randomNumber = (rand() / div);
    // printf("%d\n", randomNumber);
    return randomNumber;
}

void print_2d_arr(double *arr, size_t row, size_t col)
{
    size_t i, j, index;

    for (i = 0; i < row; i++)
    {
        for (j = 0; j < col; j++)
        {
            index = i * col + j;
            printf("%3f ", arr[index]);
        }
        printf("\n");
    }
}
void print_1d_arr(double *arr, size_t row)
{
    size_t i;
    for (i = 0; i < row; i++)
    {
        printf("%f, ", arr[i]);
    }
    printf("\n");
}

size_t **fullfillArrayWithRandomNumbers(double *arr, size_t n)
{
    /*
    * Fulfilling the array with random numbers 
    * */
    size_t i;
    for (i = 0; i < n; i++)
    {
        arr[i] = get_random();
    }
    return 0;
}

double *allocarray1D(size_t size)
{
    double *array = calloc(size, sizeof(double));
    totalMemUsage = totalMemUsage + size * sizeof(double);
    return array;
}

size_t ParallelRowMatrixVectorMultiply(size_t n, double *a, double *b, double *x, MPI_Comm comm)
{
    size_t i, j;
    size_t nlocal;
    double *fb;
    int npes, myrank;
    MPI_Comm_size(comm, &npes);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    fb = (double *)malloc(n * sizeof(double));
    nlocal = n / npes;
    MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
    for (i = 0; i < nlocal; i++)
    {
        x[i] = 0.0;
        for (j = 0; j < n; j++)
        {
            size_t index = i * n + j;
            x[i] += a[index] * fb[j];
        }
    }
    free(fb);
    return 0;
}

size_t ParallelRowMatrixVectorMultiply_WithoutAllgather(size_t n, double *a, double *b, double *x_partial, double *x, MPI_Comm comm)
{

    // Process 0 sends b to everyone
    MPI_Bcast(b, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    size_t i, j;
    size_t nlocal;
    // double *fb;
    int npes, myrank;
    MPI_Comm_size(comm, &npes);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // fb = (double *)malloc(n * sizeof(double));
    nlocal = n / npes;
    // MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
    for (i = 0; i < nlocal; i++)
    {
        x_partial[i] = 0.0;
        for (j = 0; j < n; j++)
        {
            size_t index = i * n + j;
            // printf("%f x %f\n", a[index], b[j]);
            x_partial[i] += a[index] * b[j];
        }
    }
    // free(b);

    // Process 0 gathers x_partials to create x
    MPI_Gather(x_partial, nlocal, MPI_DOUBLE, x, nlocal, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    return 0;
}

size_t SequentialMatrixMultiply(size_t n, double *a, double *b, double *x)
{
    size_t i, j;
    for (i = 0; i < n; i++)
    {
        x[i] = 0.0;
        for (j = 0; j < n; j++)
        {
            size_t index = i * n + j;
            // printf("%f x %f\n", a[index], b[j]);
            x[i] += a[index] * b[j];
        }
    }
    return 0;
}

int main(int argc, char *argv[])
{
    // Global declerations
    size_t i;
    // MPI_Status status;

    // Initialize the MPI environment
    MPI_Init(&argc, &argv);

    // Get the number of processes
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // Get the rank of the process
    int taskid;
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);

    // Get the name of the processor
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);

    if (argc != 2)
    {
        if (taskid == 0)
            printf("Usage: %s <N>\n", argv[0]);
        MPI_Finalize();
        return 0;
    }
    srand(time(NULL) + taskid);
    size_t n = atoi(argv[1]);
    size_t nOverK = n / world_size;

    double *a = allocarray1D(n * n);
    double *b = allocarray1D(n);
    double *x = allocarray1D(n);
    double *x_partial = allocarray1D(nOverK);
    double *xseq = allocarray1D(n);

    double *a_partial = allocarray1D(n * nOverK);

    if (a == NULL || b == NULL || x == NULL || xseq == NULL || x_partial == NULL)
    {
        if (taskid == 0)
            printf("Allocation failed\n");
        MPI_Finalize();
        return 0;
    }
    // Process 0 creates A matrix.
    if (taskid == 0)
    {
        fullfillArrayWithRandomNumbers(a, n * n);
        // Process 0 produces the b
        fullfillArrayWithRandomNumbers(b, n);
    }

    // Process 0 sends a_partial to everyone
    if (!(world_size == 1 && n == 64000))
    {
        MPI_Scatter(a, n * nOverK, MPI_DOUBLE, a_partial, n * nOverK, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    double time_start = MPI_Wtime();
    ParallelRowMatrixVectorMultiply_WithoutAllgather(n, a_partial, b, x_partial, x, MPI_COMM_WORLD);
    double time_end = MPI_Wtime();
    double parallel_exec_time = time_end - time_start;

    double *exec_times = allocarray1D(world_size);
    // Process 0 gathers x_partials to create x
    MPI_Gather(&parallel_exec_time, 1, MPI_DOUBLE, exec_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    // print_1d_arr(x, n);

    if (taskid == 0)
    {
        SequentialMatrixMultiply(n, a, b, xseq);
        // check difference between x and xseq using OpenMP
        //print_1d_arr(exec_times, world_size);
        // print_1d_arr(xseq, n);
        double max_exec, min_exec, avg_exec;
        min_exec = 1000;
        for (i = 0; i < world_size; i++)
        {
            if (max_exec < exec_times[i])
            {
                max_exec = exec_times[i];
            }
            if (min_exec > exec_times[i])
            {
                min_exec = exec_times[i];
            }
            avg_exec += exec_times[i];
        }
        avg_exec = avg_exec / world_size;

        long double time_start_openmp = omp_get_wtime();
        long double time_end_openmp, openmp_exec_time, min_exec_time, max_exec_time, avg_exec_time;
        max_exec_time = 0;
        max_exec_time = 1000;
        long double l2_norm = 0;
        size_t numberOfThreads = 0;
        size_t r = 0;
        double *diff_vector = allocarray1D(n);
        size_t nrepeat = 10000;

        if (world_size == 1)
        {
            #pragma omp parallel
            {
                numberOfThreads = omp_get_num_threads();
                #pragma omp parallel for private(i)
                for (i = 0; i < n; i++)
                {
                    double local_diff = x[i] - xseq[i];
                    diff_vector[i] = local_diff;
                    l2_norm += (local_diff * local_diff);
                }
            }
        }
        else
        {
            #pragma omp parallel
            {
                numberOfThreads = omp_get_num_threads();
                #pragma omp parallel for private(i)
                for (i = 0; i < n; i++)
                {
                    double local_diff = x[i] - xseq[i];
                    diff_vector[i] = local_diff;
                    l2_norm += (local_diff * local_diff);
                }
            }
        }
        l2_norm = sqrt(l2_norm);
        time_end_openmp = omp_get_wtime();
        openmp_exec_time = time_end_openmp - time_start_openmp;
        // print matrix size, number of processors, number of threads, time, time_openmp, L2 norm of difference of x and xseq (use %.12e while printing norm)
        if (world_size == 1)
        {
            printf("OPENMP: %d %ld %Lf %.12e\n", n, numberOfThreads, openmp_exec_time, openmp_exec_time, l2_norm);
            printf("NEW_OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
        }
        printf("MIN_AVG_MAX: %d %d %f %f %f\n", n, world_size, min_exec, max_exec, avg_exec);
        printf("MPI: %d %d %f %.12Lf %.12e\n", n, world_size, max_exec, l2_norm, l2_norm);
        totalMemUsage = totalMemUsage / (1024 * 1024 * 1024);
        printf("TOTALMEMUSAGE: %zu\n", totalMemUsage);

        //printf("process: %d %d %d %f %.12e\n", taskid, n, world_size, parallel_exec_time, l2_norm);
        //printf("%d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
    }
    MPI_Finalize();
    return 0;
}

다음은 출력입니다.


cn009
36
mpicc -fopenmp -g -ggdb  -lm -o rowmv rowmv.c 


OPENMP: 32000 1 0.000299 2.991110086441e-04
MIN_AVG_MAX: 32000 1 3.112523 3.112523 3.112523
MPI: 32000 1 3.112523 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 2 0.000535 5.350699648261e-04
MIN_AVG_MAX: 32000 1 3.125519 3.125519 3.125519
MPI: 32000 1 3.125519 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 4 0.000434 4.341900348663e-04
MIN_AVG_MAX: 32000 1 3.170650 3.170650 3.170650
MPI: 32000 1 3.170650 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 8 0.000454 4.542167298496e-04
MIN_AVG_MAX: 32000 1 3.168685 3.168685 3.168685
MPI: 32000 1 3.168685 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 16 0.000507 5.065393634140e-04
MIN_AVG_MAX: 32000 1 3.158761 3.158761 3.158761
MPI: 32000 1 3.158761 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15


OPENMP: 32000 32 0.000875 8.752988651395e-04
MIN_AVG_MAX: 32000 1 3.166051 3.166051 3.166051
MPI: 32000 1 3.166051 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15

답변

2 dreamcrash Dec 17 2020 at 01:33

내가 잘못 보거나 32 개의 스레드를 사용하는 것이 1 개의 스레드보다 10 배 느립니까? 그래서 내가 여기서 뭘 잘못하고 있니?

OpenMP로 프로파일 링되고 병렬화되는 코드 부분에서 :

 #pragma omp parallel
 {
    numberOfThreads = omp_get_num_threads();
    #pragma omp parallel for private(i)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
 }

경쟁 조건, 즉 변수에 대한 액세스가 있습니다 l2_norm. 또한 병렬화 된 루프 private(i)의 인덱스 변수 ( 예 :)가 OpenMP에 의해 i암시 적으로 비공개 로 설정되므로을 삭제할 수 있습니다 . 경쟁 조건은 OpenMP 감소 로 수정할 수 있습니다 . 또한 루프는 원하는대로 스레드간에 반복을 실제로 배포하지 않습니다. 그에 parallel 절을 다시 추가 #pragma omp for하고 중첩 된 병렬 처리를 비활성화했다고 가정 하기 때문에 기본적으로 외부에서 생성 된 각 스레드 parallel region는 해당 영역 내에서 코드를 "순차적으로" 실행합니다 .

    #pragma omp parallel for private(i)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }

따라서 각 스레드는 N병렬화하려는 루프의 모든 반복을 실행합니다 . 결과적으로 병렬 처리를 제거 하고 순차 코드에 추가 오버 헤드 ( 예 : 스레드 생성)를 추가합니다. 이러한 문제 ( 예 : 경합 상태 및 "중첩 된" 병렬 영역)를 수정하려면이 코드를 다음과 같이 변경하십시오.

 #pragma omp parallel
 {
    numberOfThreads = omp_get_num_threads();
    #pragma omp for reduction(+:l2_norm)
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
 }

이제, 즉 병렬 루프의 하이브리드 병렬화의 컨텍스트에서 수행되고 있음을 또 다른 문제 (성능 현명한), 여전히 남아있는 이러한 문제를 해결하는 데 OpenMP + MPI, 그리고 당신은 명시 적으로하지 않았다 바인드 OpenMP합니다 (내 스레드 MPI프로세스)에 해당 코어. 명시적인 바인딩이 없으면 해당 스레드가 어떤 코어에서 끝날지 확신 할 수 없습니다. 당연히 동일한 논리 코어 에서 여러 스레드를 실행하면 병렬화되는 애플리케이션의 전체 실행이 증가하는 경우가 많습니다.

애플리케이션에서 스레드를 사용하는 경우, --bind-to none을 지정하여 전혀 바인딩되지 않았는지 확인하거나 적절한 바인딩 수준 또는 애플리케이션 당 특정 수의 처리 요소를 사용하여 여러 코어에 바인딩되었는지 확인하고 싶을 것입니다. 방법. 다음 방법 중 하나로이 문제를 해결할 수 있습니다.

--bind-to none스레드를 다른 코어에 할당 할 수 있도록 MPI 플래그로 바인딩을 비활성화합니다 .
또는 그에 따라 스레드의 경계를 수행하십시오. .NET과 같은 하이브리드 병렬화에서 스레드를 코어에 매핑하는 방법에 대해이 SO 스레드 를 확인하십시오 MPI + OpenMP.

그에 따라 프로세스 당 스레드 수를 명시 적으로 설정하면 여러 스레드가 동일한 코어에서 끝나는 것을 방지하고 결과적으로 동일한 코어 내의 스레드 가 동일한 리소스 를두고 싸우는 것을 방지 할 수 있습니다.

조언:

IMO는 먼저 OpenMPMPI 프로세스없이 단독 성능을 테스트해야 합니다. 이 컨텍스트에서 2스레드에 대한 순차 버전을 측정 한 다음 4, 8등 을 측정하고 점차 스레드 수를 늘려 코드의 확장 성을 테스트합니다 . 결국 코드가 단순히 확장을 중지하는 스레드가 많이 있습니다. 당연히 스레드가 수행하는 병렬 작업의 양은 병렬 처리의 오버 헤드를 극복 할 수있을만큼 커야합니다. 따라서 더 크고 더 큰 입력으로 주위를 테스트해야합니다.

프로파일 링하고 개선 된 OpenMP버전을 테스트 한 후 .NET을 사용하여 여러 프로세스로 공유 메모리 병렬화를 확장 할 수 있습니다 MPI.

1 HristoIliev Dec 17 2020 at 19:49

@dreamcrash의 답변에 명시된 공유 변수 업데이트의 경쟁 조건 외에도 코드가 작업을 제대로 배포하지 않습니다.

#pragma omp parallel
{
    numberOfThreads = omp_get_num_threads();
    #pragma omp parallel for private(i)
                ~~~~~~~~
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
}

parallel내부 루프 의 구성은 중첩 된 결합 병렬 for구성을 만듭니다. 이는 외부 병렬 루프를 실행하는 팀의 각 스레드가 완전히 새로운 병렬 영역을 생성하고 그 안에있는 스레드에 루프를 배포한다는 것을 i의미합니다. 없다 외부 병렬 지역에서 일어나는 어떤 분포 와 함께 당신은 결국 N은 모두 동일한 작업을 반복 스레드. 기본적으로 중첩 된 병렬 처리는 비활성화되어 있으므로 중첩 된 병렬 영역은 순차적으로 실행되며 코드는 다음을 효과적으로 수행합니다.

#pragma omp parallel
{
    numberOfThreads = omp_get_num_threads();
    for (i = 0; i < n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += (local_diff * local_diff);
    }
}

작업 분배가 없으며 모든 스레드가 diff_vector[]어레이 의 동일한 위치에 씁니다.

한편으로,이 코드는 일반적으로 데이터 바이트 당 계산량이 적기 때문에 메모리 바운드 코드입니다. 최신 CPU는주기 당 많은 곱셈과 뺄셈을 수행 할 수 있지만 메모리에서 데이터를 가져오고 결과를 다시 기록하는 데 많은주기가 걸립니다. 제한 요소가 메모리 대역폭이기 때문에 메모리 바인딩 문제는 더 많은 스레드로 더 빨리 진행되지 않습니다. 32K 어레이 항목이 256KB의 메모리를 차지하고 대부분의 CPU 캐시에 맞고 L3 캐시가 엄청나게 빠르지 만 여전히 단일의 가장 빠른 L1 캐시보다 크기 때문에 이것은 큰 문제가 아닙니다. CPU 코어. 반면에 여러 스레드에서 동일한 메모리 영역에 쓰면 스레드 간 캐시 무효화와 함께 true 및 false 공유가 발생하며 일반적으로 병렬 코드가 순차 버전보다 느리게 실행됩니다.

코드 성능을 분석하고 문제를 발견하는 데 도움이되는 도구가 있습니다. 이미 의견에서 썼 듯이 Intel VTune은 그중 하나이며 oneAPI 툴킷의 일부로 무료로 사용할 수 있습니다. Intel Inspector는 또 다른 하나이며 (다시 말하지만, 무료이며 oneAPI 툴킷의 일부) 데이터 경쟁과 같은 문제를 찾습니다. 두 도구는 함께 잘 작동하며 야심 찬 병렬 프로그래머에게 충분히 강력하게 권장 할 수 없습니다.

에 쓰는 사소한 경쟁 조건도 numberOfThreads있지만 기록 된 모든 값이 동일하기 때문에 논리적 문제는 아닙니다. 문제가되는 코드의 올바른 버전은 다음과 같아야합니다.