Cell BE Tutorial

De LemmaWiki

A step-by-step tutorial on Cell BE programming by Manoel T. F. Cunha, D.Sc.

Conteúdo

HELLO WORLD

SPE Runtime Management Library Version 1 to Version 2 Migration Guide

Hello World
hello_ppu.c ( non-threaded )

#include<stdio.h>
#include<libspe2.h>
extern spe_program_handle_t hello_spu;
// PPU entry point (non-threaded): creates one SPE context, loads hello_spu
// into it, runs it from this thread and releases the context.
// Returns 0 on success, 1 if any step failed.
int main(void) {
  spe_context_ptr_t context;
  unsigned int entry = SPE_DEFAULT_ENTRY;
  int status = 0;
  // initialize context data structure
  context = spe_context_create(0,NULL);
  if (context == NULL) {
    printf("context_create error\n");
    return 1;  // nothing to load, run or destroy
  }
  // load executable into the SPE's local store
  if (spe_program_load(context,&hello_spu)) {
    printf("program_load error\n");
    status = 1;
  }
  // only run a program that was actually loaded
  else if (spe_context_run(context,&entry,0,NULL,NULL,NULL) < 0) {
    printf("context_run error\n");
    status = 1;
  }
  // release associated resources and free memory
  spe_context_destroy(context);
  printf ("Hello world ! PPU\n");
  return status;
}

hello_ppu.c ( single-threaded )

#include<stdio.h>
#include<libspe2.h>
#include<pthread.h>
// A secondary function must be defined
// which is passed to the pthread_create function.
// The secondary function should run the SPU context.
//   arg : pointer to the caller's spe_context_ptr_t.
// The thread ends through pthread_exit(NULL), so no value is returned.
void *ppu_pthread_function(void *arg) {
  spe_context_ptr_t context = *(spe_context_ptr_t *) arg;
  unsigned int entry = SPE_DEFAULT_ENTRY;
  // report a failed run instead of silently ignoring it
  if (spe_context_run(context,&entry,0,NULL,NULL,NULL) < 0)
    printf("context_run error\n");
  pthread_exit(NULL);
}
extern spe_program_handle_t hello_spu;
// PPU entry point (single-threaded): creates one SPE context, loads
// hello_spu into it and runs it on a separate pthread.
// Returns 0 on success, 1 if the context, the program load or the thread
// creation failed.
int main(void) {
  spe_context_ptr_t context;
  pthread_t pthread;
  context = spe_context_create(0,NULL);
  if (context == NULL) {
    printf("context_create error\n");
    return 1;
  }
  if (spe_program_load(context,&hello_spu)) {
    printf("program_load error\n");
    spe_context_destroy(context);
    return 1;
  }
  // pthread_create returns 0 on success; only join a thread that exists
  if (pthread_create(&pthread,NULL,&ppu_pthread_function,&context) != 0) {
    printf("pthread_create error\n");
    spe_context_destroy(context);
    return 1;
  }
  pthread_join(pthread,NULL);
  spe_context_destroy(context);
  printf ("Hello world ! PPU\n");
  return 0;
}

hello_spu.c

#include<stdio.h>
// SPU entry point: writes the greeting to standard output and returns 0.
//   speid : first argument handed to the SPU program; not used here.
int main(unsigned long long speid) {
  (void) speid;
  fputs("Hello world ! SPU\n", stdout);
  return 0;
}

Makefiles
export CELL_TOP=/opt/cell/sdk
PPU Makefile

# PPU-side build settings, followed by the SDK's shared make.footer.
# NOTE(review): presumably DIRS lists sub-directories to build first,
# PROGRAM_ppu names the PPU executable and IMPORTS the archives/libraries
# to link -- confirm against make.footer.
DIRS := spu
PROGRAM_ppu := hello_ppu
IMPORTS := spu/hello_spu.a -lspe2 -lpthread
include $(CELL_TOP)/buildutils/make.footer

SPU Makefile

# SPU-side build settings, followed by the SDK's shared make.footer.
# NOTE(review): presumably PROGRAM_spu names the SPU executable and
# LIBRARY_embed the archive that the PPU Makefile links as
# spu/hello_spu.a -- confirm against make.footer.
PROGRAM_spu := hello_spu
LIBRARY_embed := hello_spu.a
include $(CELL_TOP)/buildutils/make.footer

Context
A data structure that contains all the persistent information about an SPE.

spe_context_ptr_t spe_context_create (unsigned int flags, spe_gang_context_ptr_t gang)

  • flags : a bit-wise OR of modifiers.
  • gang : group of contexts.


int spe_program_load (spe_context_ptr_t context, spe_program_handle_t *program)

  • context : SPU identifier.
  • program : program to be loaded into the LS.


int spe_context_run (spe_context_ptr_t context, unsigned int *entry, unsigned int flags, void *argp, void *envp, spe_stop_info_t *stopinfo)

  • context : SPU identifier.
  • entry : program's starting instruction pointer.
  • flags : a bit-wise OR of modifiers.
  • argp : optional pointer to application specific data. It is passed as the second parameter of the SPU program.
  • envp : optional pointer to environment specific data. It is passed as the third parameter of the SPU program.
  • stopinfo : optional pointer to a structure that provides information about the termination condition.
read more on Programming Tutorial ...


MAILBOXES

Mailboxes
mbox_ppu.c

#include <stdio.h>
#include <libspe2.h>
#include <pthread.h>
// Thread entry point: runs the SPE context whose handle `arg` points to.
//   arg : pointer to the caller's spe_context_ptr_t.
// The thread ends through pthread_exit(NULL), so no value is returned.
void *ppu_pthread_function(void *arg) {
  spe_context_ptr_t context = *(spe_context_ptr_t *) arg;
  unsigned int entry = SPE_DEFAULT_ENTRY;
  // report a failed run instead of silently ignoring it
  if (spe_context_run(context,&entry,0,NULL,NULL,NULL) < 0)
    printf("context_run error\n");
  pthread_exit(NULL);
}
extern spe_program_handle_t mbox_spu;
// PPU entry point: starts mbox_spu on up to NUM_SPES SPE contexts, one
// pthread each, then sends every SPE its index through the inbound mailbox.
// Returns 0 on success, 1 if a context could not be set up or a message
// could not be written.
int main(void) {
  enum { NUM_SPES = 8 };
  spe_context_ptr_t context[NUM_SPES];
  pthread_t pthread[NUM_SPES];
  unsigned int i, started = 0;
  int status = 0;
  for (i=0;i<NUM_SPES;i++) {
    context[i] = spe_context_create(0,NULL);
    if (context[i] == NULL) {
      printf("context_create error\n");
      status = 1;
      break;
    }
    if (spe_program_load(context[i],&mbox_spu)) {
      printf("program_load error\n");
      spe_context_destroy(context[i]);
      status = 1;
      break;
    }
    if (pthread_create(&pthread[i],NULL,&ppu_pthread_function,&context[i]) != 0) {
      printf("pthread_create error\n");
      spe_context_destroy(context[i]);
      status = 1;
      break;
    }
    started++;
  }
  // write SPU ids using mailboxes
  // exactly one message must be accepted; 0 and negative results are failures
  for (i=0;i<started;i++)
    if (spe_in_mbox_write(context[i],&i,1,SPE_MBOX_ANY_NONBLOCKING) != 1) {
      printf("Message could not be written\n");
      status = 1;
    }
  // only join and destroy what was actually started
  for (i=0;i<started;i++) {
    pthread_join(pthread[i],NULL);
    spe_context_destroy(context[i]);
  }
  printf ("End of PPU thread\n");
  return status;
}

mbox_spu.c

#include<stdio.h>
#include<spu_mfcio.h>
// SPU entry point: reads one word from the inbound mailbox and prints it
// as this SPU's id.
//   speid : first argument handed to the SPU program; not used here.
int main(unsigned long long speid) {
  // read SPU id using mailbox
  unsigned int spu_id = spu_read_in_mbox();
  // %u matches the unsigned type of spu_id
  printf ("Hello world ! SPU %u\n",spu_id);
  return 0;
}

Mailboxes
  • Used to exchange short 32-bit messages
    • storage addresses
    • control communication
      • program status
      • completion flags
  • An SPE has one mailbox for receiving messages
    • SPU Read Inbound Mailbox
      • 4 deep
      • can be overwritten (message can be lost)
      • SPU stalls on reading empty mailbox
  • An SPE has two mailboxes for sending messages
    • SPU Write Outbound Mailbox
      • 1 deep
      • SPU stalls writing to full mailbox
    • SPU Write Outbound Interrupt Mailbox

int spe_in_mbox_write (spe_context_ptr_t context, unsigned int *data, int count, unsigned int behavior)

  • context : SPU identifier.
  • data : pointer to an array of unsigned integers containing the messages to be written.
  • count : maximum number of messages to be written.
  • behavior : specifies whether the call should block until messages are written.
read more on SPE Runtime Management Library ...


DMA

DMA
dma_ppu.c

#include <stdio.h>
#include <string.h>
#include <libspe2.h>
#include <pthread.h>
// Per-thread bundle of the arguments that ppu_pthread_function forwards
// to spe_context_run.
struct ppu_pthread_data {
  spe_context_ptr_t context;  // SPE context to run
  unsigned int entry;         // entry point, passed by address
  void *argp;                 // forwarded as the argp argument
  void *envp;                 // forwarded as the envp argument
};
typedef struct ppu_pthread_data ppu_pthread_data_t;
// Thread entry point: runs one SPE context with the arguments packed in a
// ppu_pthread_data_t.
//   arg : pointer to the caller's ppu_pthread_data_t; its entry field is
//         passed by address to spe_context_run.
// The thread ends through pthread_exit(NULL), so no value is returned.
void *ppu_pthread_function(void *arg) {
  ppu_pthread_data_t *data = (ppu_pthread_data_t *) arg;
  // report a failed run instead of silently ignoring it
  if (spe_context_run(data->context,&data->entry,0,data->argp,data->envp,NULL) < 0)
    printf("context_run error\n");
  pthread_exit(NULL);
}
extern spe_program_handle_t dma_spu;
// PPU entry point: starts dma_spu on up to NUM_SPES SPE contexts. Each SPE
// receives the address of `buffer` as argp, the buffer size as envp, and
// its index through the inbound mailbox.
// Returns 0 on success, 1 if a context could not be set up or a mailbox
// message could not be written.
int main(void) {
  enum { NUM_SPES = 8 };
  char buffer[128] __attribute__((aligned(128)));
  strcpy(buffer,"Hello world !");
  ppu_pthread_data_t ptdata[NUM_SPES];
  pthread_t pthread[NUM_SPES];
  unsigned int i, started = 0;
  int status = 0;
  for (i=0;i<NUM_SPES;i++) {
    ptdata[i].context = spe_context_create(0,NULL);
    if (ptdata[i].context == NULL) {
      printf("context_create error\n");
      status = 1;
      break;
    }
    if (spe_program_load(ptdata[i].context,&dma_spu)) {
      printf("program_load error\n");
      spe_context_destroy(ptdata[i].context);
      status = 1;
      break;
    }
    ptdata[i].entry = SPE_DEFAULT_ENTRY;
    ptdata[i].argp = (void *) buffer;
    ptdata[i].envp = (void *) sizeof(buffer);  // DMA size, in bytes
    if (pthread_create(&pthread[i],NULL,&ppu_pthread_function,&ptdata[i]) != 0) {
      printf("pthread_create error\n");
      spe_context_destroy(ptdata[i].context);
      status = 1;
      break;
    }
    started++;
    // exactly one message must be accepted; anything else is a failure
    if (spe_in_mbox_write(ptdata[i].context,&i,1,SPE_MBOX_ANY_NONBLOCKING) != 1) {
      printf("Message could not be written\n");
      status = 1;
    }
  }
  // only join and destroy what was actually started
  for (i=0;i<started;i++) {
    pthread_join(pthread[i],NULL);
    spe_context_destroy(ptdata[i].context);
  }
  printf ("Bonjour monde ! PPU\n");
  return status;
}

dma_spu.c

#include<stdio.h>
#include<spu_mfcio.h>
// SPU entry point: reads its id from the inbound mailbox, fetches a text
// buffer from main memory with DMA and prints it.
//   speid : first argument handed to the SPU program; not used here.
//   argp  : effective address of the source buffer in main memory.
//   envp  : number of bytes to transfer.
int main(unsigned long long speid,unsigned long long argp,unsigned long long envp) {
  char buffer[128] __attribute__((aligned(128)));
  unsigned int spu_id = spu_read_in_mbox();
  // never transfer more than the local buffer can hold
  unsigned int size = envp > sizeof(buffer) ? sizeof(buffer) : (unsigned int) envp;
  // transfer data from PPU using DMA
  int tag = 1, tag_mask = 1<<tag;
  // pass the full 64-bit effective address; a cast to unsigned int would
  // drop its upper half
  mfc_get(buffer,argp,size,tag,0,0);
  mfc_write_tag_mask(tag_mask);
  mfc_read_tag_status_all();
  // guarantee termination before printing with %s
  buffer[sizeof(buffer)-1] = '\0';
  printf ("%s SPU %u\n",buffer,spu_id);
  return 0;
}

DMA Transfers
  • Asynchronous transfers of data and instructions between main memory and LS.
  • Naturally aligned transfer sizes of 1, 2, 4, or 8 bytes and multiples of 16 bytes.
  • Maximum transfer size of 16 KB.
  • Peak performance is achieved when both the EA and LSA are 128-byte aligned and the size of the transfer is a multiple of 128 bytes.
  • Transfer direction is always referenced from the perspective of an SPE :
    • mfc_get : transfer data into an SPE
      (from main memory to LS)
    • mfc_put : transfer data out of an SPE
      (from LS to main memory)

DMA Tags
Each DMA command is identified by a 5-bit tag.
  • Used for checking status or waiting on the completion of DMA commands.
  • Same tag can be used for multiple commands.
  • Tagging is optional but can be used to synchronize DMA commands : fence and barrier.
  • Commands with same tag value form a tag group
Tag Masks
A 32-bit word used to identify tag groups.
  • Each bit in the tag mask corresponds to a specific tag identifier : tag_mask = 1 << tag_id.
    • tag status is logically ANDed with current tag mask.
    • tag status bit of '1' indicates that no DMA requests tagged with the specific tag identifier (corresponding to the status bit location) are still either in progress or in the DMA queue.
  • Tag mask remains set until changed.

(void) mfc_get (volatile void *lsa, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)

  • lsa : local store address
  • ea : effective address in main memory
  • size : DMA transfer size
  • tag : DMA tag identifier
  • tid : transfer class identifier
  • rid : replacement class identifier


(void) mfc_read_tag_status_all (void)

  • wait until all of the specified tagged DMA commands are completed.
DMA 2
dma2_ppu.c

#include<stdio.h>
#include<libspe2.h>
#include<pthread.h>
// Per-thread bundle of the arguments that ppu_pthread_function forwards
// to spe_context_run.
struct ppu_pthread_data {
  spe_context_ptr_t context;  // SPE context to run
  unsigned int entry;         // entry point, passed by address
  void *argp;                 // forwarded as the argp argument
  void *envp;                 // forwarded as the envp argument
};
typedef struct ppu_pthread_data ppu_pthread_data_t;
// Thread entry point: runs one SPE context with the arguments packed in a
// ppu_pthread_data_t.
//   arg : pointer to the caller's ppu_pthread_data_t; its entry field is
//         passed by address to spe_context_run.
// The thread ends through pthread_exit(NULL), so no value is returned.
void *ppu_pthread_function(void *arg) {
  ppu_pthread_data_t *data = (ppu_pthread_data_t *) arg;
  // report a failed run instead of silently ignoring it
  if (spe_context_run(data->context,&data->entry,0,data->argp,data->envp,NULL) < 0)
    printf("context_run error\n");
  pthread_exit(NULL);
}
// Parameter block whose address is handed to each SPE as argp; main passes
// 128 as the matching transfer size.
// NOTE(review): a 128-byte total assumes a 4-byte int and 4-byte pointers
// (4 + 4 + 120) -- confirm the PPU program is built so that this layout
// matches the SPU-side copy of the struct.
typedef struct {
  int NN;           // element count; main stores 32 here
  float *X;         // address of the float array in main memory
  char dummy[120];  // padding
} MYSTRUCT;
extern spe_program_handle_t dma2_spu;
// PPU entry point: fills X with 0, 2, 4, ..., publishes its address through
// `mystruct`, and starts dma2_spu on up to NUM_SPES SPE contexts. Each SPE
// gets the address of mystruct as argp, 128 as envp, and its index through
// the inbound mailbox.
// Returns 0 on success, 1 if a context could not be set up or a mailbox
// message could not be written.
int main(void) {
  enum { NUM_SPES = 8 };
  float X[32] __attribute__((aligned(128)));
  unsigned int i, started = 0;
  int status = 0;
  for (i=0;i<32;i++) X[i] = i * 2;
  MYSTRUCT mystruct __attribute__((aligned(128)));
  mystruct.NN = 32;
  mystruct.X = X;
  ppu_pthread_data_t ptdata[NUM_SPES];
  pthread_t pthread[NUM_SPES];
  for (i=0;i<NUM_SPES;i++) {
    ptdata[i].context = spe_context_create(0,NULL);
    if (ptdata[i].context == NULL) {
      printf("context_create error\n");
      status = 1;
      break;
    }
    if (spe_program_load(ptdata[i].context,&dma2_spu)) {
      printf("program_load error\n");
      spe_context_destroy(ptdata[i].context);
      status = 1;
      break;
    }
    ptdata[i].entry = SPE_DEFAULT_ENTRY;
    ptdata[i].argp = (void *) &mystruct;
    ptdata[i].envp = (void *) 128;
    if (pthread_create(&pthread[i],NULL,&ppu_pthread_function,&ptdata[i]) != 0) {
      printf("pthread_create error\n");
      spe_context_destroy(ptdata[i].context);
      status = 1;
      break;
    }
    started++;
    // exactly one message must be accepted; anything else is a failure
    if (spe_in_mbox_write(ptdata[i].context,&i,1,SPE_MBOX_ANY_NONBLOCKING) != 1) {
      printf("Message could not be written\n");
      status = 1;
    }
  }
  // only join and destroy what was actually started
  for (i=0;i<started;i++) {
    pthread_join(pthread[i],NULL);
    spe_context_destroy(ptdata[i].context);
  }
  printf ("Bonjour monde ! PPU\n");
  return status;
}

dma2_spu.c

#include<stdio.h>
#include<spu_mfcio.h>
// Local copy of the parameter block that main fills with a DMA transfer
// from the address in argp.
// NOTE(review): the layout must match the struct of the same name in
// dma2_ppu.c byte for byte -- confirm pointer and int sizes agree on both
// sides.
typedef struct {
  int NN;           // not read by this program
  float *X;         // main-memory address used as the source of the second DMA
  char dummy[120];  // padding
} MYSTRUCT;
// SPU entry point: reads its id from the inbound mailbox, fetches the
// parameter block from main memory, then fetches the float array it points
// to and prints the element selected by the id.
//   speid : first argument handed to the SPU program; not used here.
//   argp  : effective address of the parameter block in main memory.
//   envp  : number of bytes of the parameter block to transfer.
int main(unsigned long long speid,unsigned long long argp,unsigned long long envp) {
  MYSTRUCT mystruct __attribute__((aligned(128)));
  float X[32] __attribute__((aligned(128)));
  unsigned int spu_id = spu_read_in_mbox();
  int tag = 1, tag_mask = 1<<tag;
  // pass the full 64-bit effective address; a cast to unsigned int would
  // drop its upper half
  mfc_get(&mystruct,argp,envp,tag,0,0);
  mfc_write_tag_mask(tag_mask);
  // the pointer inside mystruct is only valid once this transfer is done
  mfc_read_tag_status_all();
  // transfer data from PPU using DMA
  mfc_get(X,(unsigned long int) mystruct.X,sizeof(X),tag,0,0);
  mfc_read_tag_status_all();
  printf ("Hello world ! %u %5.1f\n",spu_id,X[spu_id]);
  return 0;
}

DMA
Coherent DMA transfers 
A pointer to a data structure created on the PPE can be passed to the SPU. The SPU can use this pointer to issue a DMA command to bring the data structure into its LS.


DOT PRODUCT

Dot Product
dot_ppu.c

#include <stdio.h>
#include <libspe2.h>
#include <pthread.h>
// Per-thread bundle of the arguments that ppu_pthread_function forwards
// to spe_context_run.
struct ppu_pthread_data {
  spe_context_ptr_t context;  // SPE context to run
  unsigned int entry;         // entry point, passed by address
  void *argp;                 // forwarded as the argp argument
  void *envp;                 // forwarded as the envp argument
};
typedef struct ppu_pthread_data ppu_pthread_data_t;
// Thread entry point: runs one SPE context with the arguments packed in a
// ppu_pthread_data_t.
//   arg : pointer to the caller's ppu_pthread_data_t; its entry field is
//         passed by address to spe_context_run.
// The thread ends through pthread_exit(NULL), so no value is returned.
void *ppu_pthread_function(void *arg) {
  ppu_pthread_data_t *data = (ppu_pthread_data_t *) arg;
  // report a failed run instead of silently ignoring it
  if (spe_context_run(data->context,&data->entry,0,data->argp,data->envp,NULL) < 0)
    printf("context_run error\n");
  pthread_exit(NULL);
}
// Parameter block whose address is handed to each SPE as argp; main passes
// 128 as the matching transfer size.
// NOTE(review): a 128-byte total assumes 4-byte pointers (3 * 4 + 116) --
// confirm the PPU program is built so that this layout matches the
// SPU-side copy of the struct.
typedef struct {
  float *X;         // first input vector
  float *Y;         // second input vector
  float *Z;         // output vector, summed by main after the threads finish
  char dummy[116];  // padding
} MYSTRUCT;
extern spe_program_handle_t dot_spu;
int main(void) {
  float X[256] __attribute__((aligned(128)));
  float Y[256] __attribute__((aligned(128)));
  float Z[256] __attribute__((aligned(128)));
  unsigned int i;
  for (i=0;i<256;i++) { X[i] = i * 2; Y[i] = i * 2 + 1; }
  MYSTRUCT mystruct __attribute__((aligned(128)));
  mystruct.X = X;
  mystruct.Y = Y;
  mystruct.Z = Z;
  ppu_pthread_data_t ptdata[8];
  pthread_t pthread[8];
  for (i=0;i<8;i++) {
    ptdata[i].context = spe_context_create(0,NULL);
    spe_program_load(ptdata[i].context,&dma_spu);
    ptdata[i].entry = SPE_DEFAULT_ENTRY;
    ptdata[i].argp = (void *) &mystruct;
    ptdata[i].envp = (void *) 128;
    pthread_create(&pthread[i],NULL,&ppu_pthread_function,&ptdata[i]);
    spe_in_mbox_write(ptdata[i].context,&i,1,SPE_MBOX_ANY_NONBLOCKING);
  }
  for (i=0;i<8;i++) {
    pthread_join(pthread[i],NULL);
    spe_context_destroy(ptdata[i].context);
  }
  // compute dot product
  float sum = 0.;
  for (i=0;i<256;i++) sum += Z[i];
  printf("Dot product ! %10.1f\n",sum);
  return 0;
}

dot_spu.c

#include<stdio.h>
#include<spu_mfcio.h>
// Local copy of the parameter block that main fills with a DMA transfer
// from the address in argp.
// NOTE(review): the layout must match the struct of the same name in
// dot_ppu.c byte for byte -- confirm pointer sizes agree on both sides.
typedef struct {
  float *X;         // main-memory address of the first input vector
  float *Y;         // main-memory address of the second input vector
  float *Z;         // main-memory address of the output vector
  char dummy[116];  // padding
} MYSTRUCT;
// SPU entry point: reads its id from the inbound mailbox, fetches the
// parameter block, then fetches its own 32-float slice of X and Y,
// multiplies them element by element and stores the result in the same
// slice of Z in main memory.
//   speid : first argument handed to the SPU program; not used here.
//   argp  : effective address of the parameter block in main memory.
//   envp  : number of bytes of the parameter block to transfer.
int main(unsigned long long speid,unsigned long long argp,unsigned long long envp) {
  MYSTRUCT mystruct __attribute__((aligned(128)));
  float X[32] __attribute__((aligned(128)));
  float Y[32] __attribute__((aligned(128)));
  float Z[32] __attribute__((aligned(128)));
  unsigned int spu_id = spu_read_in_mbox();
  int tag = 1, tag_mask = 1<<tag;
  // pass the full 64-bit effective address; a cast to unsigned int would
  // drop its upper half
  mfc_get(&mystruct,argp,envp,tag,0,0);
  mfc_write_tag_mask(tag_mask);
  mfc_read_tag_status_all();
  // byte offset of this SPU's slice: one local array per SPU id
  unsigned int offset = spu_id * sizeof(X);
  mfc_get(X,(unsigned long int) mystruct.X+offset,sizeof(X),tag,0,0);
  mfc_get(Y,(unsigned long int) mystruct.Y+offset,sizeof(Y),tag,0,0);
  mfc_read_tag_status_all();
  // compute vector Z = X * Y
  int i;
  for (i=0;i<32;i++) Z[i] = X[i] * Y[i];
  // transfer data to PPU using DMA
  mfc_put(Z,(unsigned long int) mystruct.Z+offset,sizeof(Z),tag,0,0);
  mfc_read_tag_status_all();
  printf("End of SPU %u thread\n",spu_id);
  return 0;
}

Dot Product
The dot product of two vectors :
  • a = [a1, a2, … , an]
  • b = [b1, b2, … , bn]

is by definition :
a \cdot b = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \cdots + a_n b_n

read more on Wikipedia ...
DMA
(void) mfc_put (volatile void *lsa, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
  • lsa : local store address
  • ea : effective address in main memory
  • size : DMA transfer size
  • tag : DMA tag identifier
  • tid : transfer class identifier
  • rid : replacement class identifier
Dot Product 2
dot2_ppu.c

#include <stdio.h>
#include <libspe2.h>
#include <pthread.h>
// Per-thread bundle of the arguments that ppu_pthread_function forwards
// to spe_context_run.
struct ppu_pthread_data {
  spe_context_ptr_t context;  // SPE context to run
  unsigned int entry;         // entry point, passed by address
  void *argp;                 // forwarded as the argp argument
  void *envp;                 // forwarded as the envp argument
};
typedef struct ppu_pthread_data ppu_pthread_data_t;
// Thread entry point: runs one SPE context with the arguments packed in a
// ppu_pthread_data_t.
//   arg : pointer to the caller's ppu_pthread_data_t; its entry field is
//         passed by address to spe_context_run.
// The thread ends through pthread_exit(NULL), so no value is returned.
void *ppu_pthread_function(void *arg) {
  ppu_pthread_data_t *data = (ppu_pthread_data_t *) arg;
  // report a failed run instead of silently ignoring it
  if (spe_context_run(data->context,&data->entry,0,data->argp,data->envp,NULL) < 0)
    printf("context_run error\n");
  pthread_exit(NULL);
}
// Parameter block whose address is handed to each SPE as argp; main passes
// 128 as the matching transfer size.
// NOTE(review): a 128-byte total assumes 4-byte pointers (2 * 4 + 120) --
// confirm the PPU program is built so that this layout matches the
// SPU-side copy of the struct.
typedef struct {
  float *X;         // first input vector
  float *Y;         // second input vector
  char dummy[120];  // padding
} MYSTRUCT;
extern spe_program_handle_t dot2_spu;
// PPU entry point: fills X and Y, starts dot2_spu on up to NUM_SPES SPE
// contexts (each gets the address of mystruct as argp, 128 as envp and its
// index through the inbound mailbox), then collects one partial sum per
// SPE from its outbound mailbox and prints the total.
// Returns 0 on success, 1 if a context could not be set up, a mailbox
// message could not be written or a partial sum could not be read.
int main(void) {
  enum { NUM_SPES = 8 };
  float X[256] __attribute__((aligned(128)));
  float Y[256] __attribute__((aligned(128)));
  unsigned int i, started = 0;
  int status = 0;
  for (i=0;i<256;i++) { X[i] = i * 2; Y[i] = i * 2 + 1; }
  MYSTRUCT mystruct __attribute__((aligned(128)));
  mystruct.X = X;
  mystruct.Y = Y;
  ppu_pthread_data_t ptdata[NUM_SPES];
  pthread_t pthread[NUM_SPES];
  for (i=0;i<NUM_SPES;i++) {
    ptdata[i].context = spe_context_create(0,NULL);
    if (ptdata[i].context == NULL) {
      printf("context_create error\n");
      status = 1;
      break;
    }
    if (spe_program_load(ptdata[i].context,&dot2_spu)) {
      printf("program_load error\n");
      spe_context_destroy(ptdata[i].context);
      status = 1;
      break;
    }
    ptdata[i].entry = SPE_DEFAULT_ENTRY;
    ptdata[i].argp = (void *) &mystruct;
    ptdata[i].envp = (void *) 128;
    if (pthread_create(&pthread[i],NULL,&ppu_pthread_function,&ptdata[i]) != 0) {
      printf("pthread_create error\n");
      spe_context_destroy(ptdata[i].context);
      status = 1;
      break;
    }
    started++;
    // exactly one message must be accepted; anything else is a failure
    if (spe_in_mbox_write(ptdata[i].context,&i,1,SPE_MBOX_ANY_NONBLOCKING) != 1) {
      printf("Message could not be written\n");
      status = 1;
    }
  }
  // read partial sums using mailboxes
  // (each SPE sends its sum converted to an unsigned integer)
  unsigned int psum; float sum = 0.;
  for (i=0;i<started;i++) {
    while (spe_out_mbox_status(ptdata[i].context) == 0) {};
    // only add a value that was really read
    if (spe_out_mbox_read(ptdata[i].context,&psum,1) == 1)
      sum += (float) psum;
    else {
      printf("Message could not be read\n");
      status = 1;
    }
  }
  // only join and destroy what was actually started
  for (i=0;i<started;i++) {
    pthread_join(pthread[i],NULL);
    spe_context_destroy(ptdata[i].context);
  }
  printf ("Dot product : %10.1f\n",sum);
  return status;
}

dot2_spu.c

#include<stdio.h>
#include<spu_mfcio.h>
// Local copy of the parameter block that main fills with a DMA transfer
// from the address in argp.
// NOTE(review): the layout must match the struct of the same name in
// dot2_ppu.c byte for byte -- confirm pointer sizes agree on both sides.
typedef struct {
  float *X;         // main-memory address of the first input vector
  float *Y;         // main-memory address of the second input vector
  char dummy[120];  // padding
} MYSTRUCT;
// SPU entry point: reads its id from the inbound mailbox, fetches the
// parameter block, then fetches its own 32-float slice of X and Y and
// sends the sum of their products to the PPU through the outbound mailbox.
//   speid : first argument handed to the SPU program; not used here.
//   argp  : effective address of the parameter block in main memory.
//   envp  : number of bytes of the parameter block to transfer.
int main(unsigned long long speid,unsigned long long argp,unsigned long long envp) {
  MYSTRUCT mystruct __attribute__((aligned(128)));
  float X[32] __attribute__((aligned(128)));
  float Y[32] __attribute__((aligned(128)));
  unsigned int spu_id = spu_read_in_mbox();
  // transfer structure from PPU using DMA
  int tag = 1, tag_mask = 1<<tag;
  // pass the full 64-bit effective address; a cast to unsigned int would
  // drop its upper half
  mfc_get(&mystruct,argp,envp,tag,0,0);
  mfc_write_tag_mask(tag_mask);
  mfc_read_tag_status_all();
  // transfer data from PPU using DMA
  // byte offset of this SPU's slice: one local array per SPU id
  unsigned int offset = spu_id * sizeof(X);
  mfc_get(X,(unsigned long int) mystruct.X+offset,sizeof(X),tag,0,0);
  mfc_get(Y,(unsigned long int) mystruct.Y+offset,sizeof(Y),tag,0,0);
  mfc_read_tag_status_all();
  // compute partial sum
  int i; float sum = 0.;
  for (i=0;i<32;i++) sum += X[i] * Y[i];
  // write partial sum using mailbox
  // (the mailbox carries an unsigned integer, so the sum is converted)
  spu_write_out_mbox((unsigned int) sum);
  printf("End of SPU %u thread\n",spu_id);
  return 0;
}

Mailboxes
  • PPU must check Mailbox Status Register to determine that unread data is available in the SPU Outbound Mailbox or SPU Outbound Interrupt Mailbox otherwise stale or undefined data may be returned.
  • The PPE should not read the mailbox directly; it should first poll the Mailbox Status Register until data is available.

int spe_out_mbox_status (spe_context_ptr_t context)

  • context : SPU identifier.
  • return value = 0 : the mailbox is empty


int spe_out_mbox_read (spe_context_ptr_t context, unsigned int *data, int count)

  • context : SPU identifier.
  • data : pointer to an array of unsigned integers to store the messages to be read.
  • count : maximum number of messages to be read.
read more on SPE Runtime Management Library ...
Dot Product 3
dot3_ppu.c
same as dot2_ppu.c
dot3_spu.c

#include<stdio.h>
#include<spu_mfcio.h>
// Local copy of the parameter block that main fills with a DMA transfer
// from the address in argp.
// NOTE(review): the layout must match the struct used by the PPU program
// (dot2_ppu.c) byte for byte -- confirm pointer sizes agree on both sides.
typedef struct {
  float *X;         // main-memory address of the first input vector
  float *Y;         // main-memory address of the second input vector
  char dummy[120];  // padding
} MYSTRUCT;
// Overlay that lets the four lanes of a vector float be read back as
// individual floats.
union vecvar {
  float val[4];      // lane-by-lane view
  vector float vec;  // whole-vector view
};
typedef union vecvar VECVAR;
// SPU entry point (SIMD version): reads its id from the inbound mailbox,
// fetches the parameter block, then fetches its own 32-float slice of X
// and Y, accumulates their products four lanes at a time with spu_madd and
// sends the total to the PPU through the outbound mailbox.
//   speid : first argument handed to the SPU program; not used here.
//   argp  : effective address of the parameter block in main memory.
//   envp  : number of bytes of the parameter block to transfer.
int main(unsigned long long speid,unsigned long long argp,unsigned long long envp) {
  float X[32] __attribute__((aligned(128)));
  float Y[32] __attribute__((aligned(128)));
  vector float *vecX = (vector float *) X;
  vector float *vecY = (vector float *) Y;
  vector float vecA = (vector float) {0.,0.,0.,0.};
  VECVAR vecB;
  MYSTRUCT mystruct __attribute__((aligned(128)));
  unsigned int spu_id = spu_read_in_mbox();
  int tag = 1, tag_mask = 1<<tag;
  // pass the full 64-bit effective address; a cast to unsigned int would
  // drop its upper half
  mfc_get(&mystruct,argp,envp,tag,0,0);
  mfc_write_tag_mask(tag_mask);
  mfc_read_tag_status_all();
  // byte offset of this SPU's slice: one local array per SPU id
  unsigned int offset = spu_id * sizeof(X);
  // both transfers share the tag, so one wait covers them
  mfc_get(X,(unsigned long int) mystruct.X+offset,sizeof(X),tag,0,0);
  mfc_get(Y,(unsigned long int) mystruct.Y+offset,sizeof(Y),tag,0,0);
  mfc_read_tag_status_all();
  // 32 floats = 8 vectors of 4 lanes; vecA accumulates X*Y per lane
  int i;
  for (i=0;i<8;i++) vecA = spu_madd(vecX[i],vecY[i],vecA);
  // read the four lane totals back through the union
  vecB.vec = vecA;
  float sum = vecB.val[0] + vecB.val[1] + vecB.val[2] + vecB.val[3];
  // (the mailbox carries an unsigned integer, so the sum is converted)
  spu_write_out_mbox((unsigned int) sum);
  printf("End of SPU %u thread\n",spu_id);
  return 0;
}



Day 1 - 01 Day 1 - 02 Day 1 - 03 Day 1 - 04 Day 1 - 05 Day 1 - 06 Day 1 - 07 Day 1 - 08
Day 1 - 09 Day 1 - 10 Day 1 - 11 Day 1 - 12 Day 1 - 13 Day 2 - 01 Day 2 - 02 Day 2 - 03
Day 2 - 04 Day 2 - 05 Day 2 - 06 Day 2 - 07 Day 2 - 08 Day 2 - 09 Day 2 - 10


Back to Manoel Cunha
Ferramentas pessoais