module pop_c
  #(int width = 128, int bits = $clog2(width+1))
   (output logic [bits:1] pop, input [width-1:0] vector);

   // Behavioral population count: pop is the number of 1 bits in vector.
   //
   //   width  - number of bits in vector.
   //   bits   - number of bits in pop; the default, $clog2(width+1), is
   //            just wide enough to hold the value width.
   //
   // The count is rebuilt from zero every time vector changes by adding
   // each bit of vector, one position at a time, to a running total.

   always_comb begin

      pop = '0;

      for ( int idx = 0; idx < width; idx++ ) begin
         pop = pop + vector[idx];
      end

   end

endmodule


module pop_2bit
  ( output [1:0] pop, input wire [1:0] vector );

   // Population count of a two-bit vector.
   //
   // Low bit of the count:  exactly one input bit is set (exclusive or).
   // High bit of the count: both input bits are set (and).
   assign pop = { vector[0] & vector[1], vector[0] ^ vector[1] };

endmodule

module some_adder
  #( int width = 32 )
   ( output wire [width-1:0] sum,
     input wire [width-1:0] a,
     input wire [width-1:0] b );

   // Plain adder: sum is a + b, kept to width bits. Any carry out of
   // the top bit is discarded because sum is no wider than the operands.
   assign sum = a + b;

endmodule

`define XXX 1
`ifdef XXX
module pop_n
  #( int width = 128, int bits = $clog2(width+1) )
   ( output wire [bits-1:0] pop,
     input wire [width-1:0] vector );

   // Population count built as a linear chain.
   //
   //   width  - number of bits in vector.
   //   bits   - number of bits in pop (default: enough to hold width).
   //
   // vector is cut into two-bit groups. Each group is counted by a
   // pop_2bit instance and a some_adder instance adds that count to the
   // running total produced by the previous group.

   // Number of complete two-bit groups in vector.
   localparam int pairs = width/2;

   // inter_mod[i] is the running total after group i has been added;
   // inter_mod[-1] is the value fed into the first adder of the chain.
   wire [bits-1:0] inter_mod[pairs-1:-1];

   // When width is odd the top bit of vector belongs to no group, so it
   // is used as the starting total. When width is even the chain starts
   // from zero. The one-bit result is zero-extended to bits bits.
   assign        inter_mod[-1] = ( width % 2 != 0 ) ? vector[width-1] : 1'b0;

   for ( genvar i=0; i<pairs; i++ ) begin

      // Count of 1s in group i.
      wire [1:0] x;

      pop_2bit p2(x, vector[2*i+1:2*i] );

      // The size cast widens (or, for bits < 2, narrows) x to the adder
      // width without relying on a padding constant whose range would be
      // reversed when bits is 2 or less.
      some_adder #(bits) a1( inter_mod[i], bits'(x), inter_mod[i-1] );

   end

   // The last element of the chain holds the total. With no groups at
   // all (width of 1) this is the starting total, inter_mod[-1].
   assign        pop = inter_mod[pairs-1];

endmodule

module pop_tree
  #( int width = 128, int bits = $clog2(width+1) )
   ( output wire [bits-1:0] pop,
     input wire [width-1:0] vector );

   // Population count built as a recursive tree.
   //
   //   width  - number of bits in vector.
   //   bits   - number of bits in pop (default: enough to hold width).
   //
   // vector is split into an upper and a lower part, each part is
   // counted by its own pop_tree instance, and the two counts are added.
   // The recursion ends at a width of 1, where the count is the bit.

   if ( width != 1 ) begin

      // Upper part gets width/2 bits, lower part gets whatever is left,
      // so an odd width puts the extra bit in the lower part.
      localparam int hi_width = width/2;
      localparam int lo_width = width - hi_width;

      // Each partial count is only as wide as its part requires.
      localparam int hi_bits = $clog2(hi_width+1);
      localparam int lo_bits = $clog2(lo_width+1);

      wire [hi_bits-1:0] hi_count;
      wire [lo_bits-1:0] lo_count;

      pop_tree #(hi_width) pleft( hi_count, vector[width-1:lo_width] );
      pop_tree #(lo_width) pright( lo_count, vector[lo_width-1:0] );

      assign pop = hi_count + lo_count;

   end else begin

      // A single bit is its own population count.
      assign pop = vector;

   end

endmodule



`else

module pop_n
  #( int width = 128, int bits = $clog2(width+1) )
   ( output wire [bits-1:0] pop,
     input wire [width-1:0] vector );

   // Population count built as a linear chain (variant in which the
   // generate loop index is the bit position of each two-bit group).
   //
   //   width  - number of bits in vector.
   //   bits   - number of bits in pop (default: enough to hold width).

   // Number of complete two-bit groups in vector.
   localparam int pairs = width/2;

   // inter_mod[k] is the running total after group k has been added;
   // inter_mod[-1] is the value fed into the first adder of the chain.
   wire [bits-1:0] inter_mod[pairs-1:-1];

   // When width is odd the top bit of vector belongs to no group, so it
   // is used as the starting total; otherwise the chain starts at zero.
   assign          inter_mod[-1] = ( width % 2 != 0 ) ? vector[width-1] : 1'b0;

   // The bound i+1<width stops before a group that would need bit
   // vector[width], which does not exist when width is odd.
   for ( genvar i=0; i+1<width; i += 2 ) begin
      // Count of 1s in bits i+1 and i.
      wire [1:0] x;
      pop_2bit p2(x, vector[i+1:i] );
      // Size cast adjusts x to the adder width for any value of bits.
      some_adder #(bits) a1( inter_mod[i/2], bits'(x), inter_mod[i/2-1] );
   end

   // The last element of the chain holds the total. With no groups at
   // all (width of 1) this is the starting total, inter_mod[-1].
   assign        pop = inter_mod[pairs-1];

endmodule


module pop_tree
  #( int width = 128, int bits = $clog2(width+1) )
   ( output wire [bits-1:0] pop,
     input wire [width-1:0] vector );

   // Recursive population count: count the two parts of vector
   // separately and add the results. A one-bit vector ends the recursion.
   //
   //   width  - number of bits in vector.
   //   bits   - number of bits in pop (default: enough to hold width).

   if ( width == 1 ) begin

      // The count of a single bit is the bit itself.
      assign pop = vector;

   end else begin

      // Sizes of the two parts; the lower part takes the extra bit when
      // width is odd.
      localparam int upper_width = width/2;
      localparam int lower_width = width - upper_width;

      // Partial counts, each just wide enough for its own part.
      wire [$clog2(upper_width+1)-1:0] upper_pop;
      wire [$clog2(lower_width+1)-1:0] lower_pop;

      pop_tree #( .width(upper_width) ) ptl
        ( .pop(upper_pop), .vector(vector[width-1:lower_width]) );

      pop_tree #( .width(lower_width) ) ptr
        ( .pop(lower_pop), .vector(vector[lower_width-1:0]) );

      assign pop = upper_pop + lower_pop;

   end

endmodule

`endif

module pop_s3
  #( int width = 128, 
     int chunk = 2,
     int bits = $clog2(width+1) )
   ( output logic [bits:1] pop,
     input [width-1:0] vector, 
     input clock);

   // Sequential population count.
   //
   //   width  - number of bits in vector.
   //   chunk  - number of bits of vector examined per clock cycle;
   //            width must be a multiple of chunk.
   //   bits   - number of bits in pop (default: enough to hold width).
   //
   // One chunk-bit slice of vector is counted per rising clock edge and
   // added to a running total. After the last slice the total is copied
   // to pop and the next pass starts from slice zero. pop therefore
   // changes only at the end of a pass, and a pass during which vector
   // changes combines slices of the old and new values.

   // cadence translate_off
   initial begin
      // The slice select below reaches past the top of vector in the
      // last round unless chunk divides width evenly.
      if ( width % chunk ) begin
         $display("Width must be a multiple of chunk size.\n");
         $fatal(1);
      end
   end
   // cadence translate_on

   // Number of slices, and so clock cycles, in one pass over vector.
   localparam int rounds = (width+chunk-1)/chunk;

   // Index of the slice being counted in the current cycle. When rounds
   // is 1, $clog2(rounds) is 0 and the range [0:1] still declares a
   // usable two-bit register.
   logic [$clog2(rounds):1] round;

   // Sum of the slice counts of the rounds already completed in this pass.
   logic [bits:1] so_far;

   localparam int chunk_bits = $clog2(chunk+1);
   wire [chunk_bits:1] pop_chunk;
   wire [chunk-1:0]    vector_part = vector[ round*chunk +: chunk ];

   // Instantiate a separate population counter for the current slice.
   pop_c #(chunk) pc(pop_chunk,vector_part);

   // cadence translate_off
   // Simulation-only start-up values. so_far is included so that the
   // first completed pass reports a known value rather than X.
   initial begin round = 0; so_far = 0; end
   // cadence translate_on
   
   // Nonblocking assignments are used so that anything else sampling pop
   // on the same clock edge sees the value from before the edge.
   always @( posedge clock ) begin

      if ( round == rounds - 1 ) begin

         // Last slice of the pass: publish the total including this
         // slice and restart.
         pop <= so_far + pop_chunk;
         so_far <= 0;
         round <= 0;

      end
      else begin

         // Accumulate the count for the current slice and move on.
         so_far <= so_far + pop_chunk;
         round <= round + 1;

      end

   end

endmodule


// cadence translate_off

module test_pop();

   // Testbench for the population-count modules.
   //
   // Each test applies a random vector to every module under test, waits,
   // and compares each module's output with a reference count computed
   // by $countbits. The first few mismatches per module are printed and
   // a per-module error summary is printed at the end.

   localparam int width = 128;
   localparam int bits = $clog2(width+1);
   localparam int numtests = 1000;

   // Display names, indexed the same way as pop[] and err_count[].
   // Index 0 is the reference value computed by the testbench itself.
   string names[] = '{ "Testbench", "Combinational",
                       "Linear elaboration.",
                       "Tree elaboration.",
                       "Sequential mod n-bits"
                       };
   
   int num_units = names.size;
   localparam int max_units = 10;

   // A mismatch is printed only while a unit's error count is below this.
   localparam int err_limit = 4;

   logic [width-1:0] vector;
   logic [bits-1:0] pop[max_units];
   logic clock;

   // Free-running clock with a period of two time units.
   initial clock = 0;
   always #1 clock = !clock;

   pop_c  #(width) p1( pop[1], vector);
   pop_n #(width) p2( pop[2], vector);
   pop_tree #(width,bits) p3( pop[3], vector);

   // Parameters are bound by name here: the second positional parameter
   // of pop_s3 is chunk, not bits, so a positional list (width,bits)
   // would set the chunk size to bits. chunk is left at its default.
   pop_s3 #( .width(width), .bits(bits) ) p4( pop[4], vector, clock);

   int err_count[max_units];

   initial begin

      for ( int i=0; i<numtests; i++ ) begin

         // Fill vector 32 bits at a time, the size of a $random result.
         for ( int j=0; j<width; j+=32 )
           vector[j +: 32 ] = $random();

         // Reference count.
         pop[0] = $countbits( vector, 1 );

         // Hold vector steady for 500 clock cycles so that the
         // sequential unit completes at least one whole pass over the
         // new vector before its output is checked.
         #1000;

         for ( int mut=1; mut<num_units; mut++ ) begin

            // Case inequality, so an X or Z output counts as an error.
            if ( pop[0] !== pop[mut] ) begin
               err_count[mut]++;
               if ( err_count[mut] < err_limit )
                 $display("Wrong answer in %s: %d != %d (correct)\n",
                          names[mut],
                          pop[mut], pop[0]);

            end

         end

      end

      for ( int mut=1; mut<num_units; mut++ )
        $display("Mut %s had %d errors, %.2f%%\n",
                 names[mut], 
                 err_count[mut], 100.0 * err_count[mut]/numtests);

      $finish(2);

   end

endmodule

// cadence translate_on


`ifdef xxx
Synthesizing with args "-to_mapped -effort high"  128 bits

Module Name                             Area   Delay   Delay
                                              Actual  Target

pop_c                                 271414    3246     100
pop_n                                 396401    3296     100
pop_tree                              337278    3480     100
pop_c                                 300928    3192    3000
pop_n                                 390596    3280    3000
pop_tree                              351155    3302    3000
pop_c                                 240371    3499    3500
pop_n                                 297622    3500    3500
pop_tree                              276932    3500    3500
pop_c                                  95937    3988    4000
pop_n                                 158029    3997    4000
pop_tree                              202811    3995    4000
pop_c                                  92284    4749    5000
pop_n                                 123406    4737    5000
pop_tree                              200568    4998    5000
pop_c                                  89860    4699  100000
pop_n                                 119922    5514  100000
pop_tree                              168896    5246  100000


  Synthesizing with args "-to_mapped -effort high" and params " 32 ".

Module Name                             Area   Delay   Delay
                                              Actual  Target

pop_c_width32                          29069    2468     100
pop_n_width32                          79872    2040     100
pop_tree_width32                       87727    1874     100
pop_c_width32                          24660    2775    3000
pop_n_width32                          32970    2766    3000
pop_tree_width32                       39937    2828    3000
pop_c_width32                          21467    3325    3500
pop_n_width32                          30621    3396    3500
pop_tree_width32                       39307    2940    3500
pop_c_width32                          21467    3307    4000
pop_n_width32                          29957    3305    4000
pop_tree_width32                       39488    3383    4000
pop_c_width32                          21467    3307    5000
pop_n_width32                          29957    3305    5000
pop_tree_width32                       39620    3383    5000
pop_c_width32                          21467    3307  100000
pop_n_width32                          28625    3459  100000
pop_tree_width32                       39318    3419  100000

`endif