// Population-count (number of 1 bits) modules in several implementation
// styles, a self-checking testbench, and recorded synthesis results.

// pop_c: behavioral population count.
//   width  - number of input bits.
//   bits   - number of output bits; the default holds any count 0..width.
//   pop    - number of 1 bits in vector.
//   vector - input bits to count.
module pop_c
  #( int width = 128,
     int bits = $clog2(width+1) )
   ( output logic [bits:1] pop,
     input [width-1:0] vector );

   always_comb begin
      pop = 0;
      for ( int i=0; i<width; i++ ) pop += vector[i];
   end

endmodule

// pop_2bit: population count of a 2-bit vector (a half adder).
module pop_2bit
  ( output [1:0] pop,
    input wire [1:0] vector );

   assign pop[0] = vector[0] ^ vector[1];  // Note: Exclusive or.
   assign pop[1] = vector[0] & vector[1];

endmodule

// some_adder: sum = a + b, truncated to width bits.
module some_adder
  #( int width = 32 )
   ( output wire [width-1:0] sum,
     input wire [width-1:0] a, b );

   assign sum = a + b;

endmodule

// XXX selects which of two equivalent descriptions of pop_n and pop_tree
// is compiled.
`define XXX 1

`ifdef XXX

// pop_n: population count built as a linear chain of 2-bit counters and
// adders, one chain stage per pair of input bits.
//   width  - number of input bits (odd widths are supported).
//   bits   - number of output bits.
//   pop    - number of 1 bits in vector.
module pop_n
  #( int width = 128,
     int bits = $clog2(width+1) )
   ( output wire [bits-1:0] pop,
     input wire [width-1:0] vector );

   localparam int pairs = width / 2;

   // inter_mod[i] is the count of bits 0 .. 2*i+1; inter_mod[-1] seeds
   // the chain with zero.
   wire [bits-1:0] inter_mod[pairs-1:-1];

   assign inter_mod[-1] = '0;

   for ( genvar i=0; i<pairs; i++ ) begin : pair
      wire [1:0] x;
      pop_2bit p2( x, vector[2*i+1:2*i] );
      // The size cast zero-extends the 2-bit pair count to the adder width.
      some_adder #(bits) a1( inter_mod[i], bits'(x), inter_mod[i-1] );
   end

   // An odd width leaves the top bit unpaired; add it after the chain.
   if ( width % 2 == 1 ) begin : odd
      assign pop = inter_mod[pairs-1] + vector[width-1];
   end else begin : even
      assign pop = inter_mod[pairs-1];
   end

endmodule

// pop_tree: population count built recursively; the vector is split in
// two halves that are counted separately and then added.
//   width  - number of input bits (at least 1).
//   bits   - number of output bits.
//   pop    - number of 1 bits in vector.
module pop_tree
  #( int width = 128,
     int bits = $clog2(width+1) )
   ( output wire [bits-1:0] pop,
     input wire [width-1:0] vector );

   if ( width == 1 ) begin : leaf

      // A single bit is its own population count.
      assign pop = vector;

   end else begin : split

      localparam int lwid = width/2;
      localparam int rwid = width - lwid;

      wire [ $clog2(lwid+1)-1: 0 ] pl;
      wire [ $clog2(rwid+1)-1: 0 ] pr;

      pop_tree #(lwid) pleft( pl, vector[width-1:rwid] );
      pop_tree #(rwid) pright( pr, vector[rwid-1:0] );

      // The operands are extended to the width of pop before adding.
      assign pop = pl + pr;

   end

endmodule

`else

// pop_n: same linear chain as the variant above, written with a loop
// index that steps by two input bits per stage.
//   width  - number of input bits (odd widths are supported).
//   bits   - number of output bits.
//   pop    - number of 1 bits in vector.
module pop_n
  #( int width = 128,
     int bits = $clog2(width+1) )
   ( output wire [bits-1:0] pop,
     input wire [width-1:0] vector );

   localparam int pairs = width / 2;

   // inter_mod[k] is the count of bits 0 .. 2*k+1; inter_mod[-1] seeds
   // the chain with zero.
   wire [bits-1:0] inter_mod[pairs-1:-1];

   assign inter_mod[-1] = '0;

   // Stop before a pair would reach past the top bit of an odd-width vector.
   for ( genvar i=0; i+1<width; i += 2 ) begin : pair
      wire [1:0] x;
      pop_2bit p2( x, vector[i+1:i] );
      // The size cast zero-extends the 2-bit pair count to the adder width.
      some_adder #(bits) a1( inter_mod[i/2], bits'(x), inter_mod[i/2-1] );
   end

   // An odd width leaves the top bit unpaired; add it after the chain.
   if ( width % 2 == 1 ) begin : odd
      assign pop = inter_mod[pairs-1] + vector[width-1];
   end else begin : even
      assign pop = inter_mod[pairs-1];
   end

endmodule

// pop_tree: same recursive description as the variant above, with the
// sub-count widths given names.
//   width  - number of input bits (at least 1).
//   bits   - number of output bits.
//   pop    - number of 1 bits in vector.
module pop_tree
  #( int width = 128,
     int bits = $clog2(width+1) )
   ( output wire [bits-1:0] pop,
     input wire [width-1:0] vector );

   if ( width == 1 ) begin : leaf

      // A single bit is its own population count.
      assign pop = vector;

   end else begin : split

      localparam int lwid = width/2;
      localparam int rwid = width - lwid;
      localparam int lbits = $clog2(lwid+1);
      localparam int rbits = $clog2(rwid+1);

      wire [lbits-1:0] lpop;
      wire [rbits-1:0] rpop;

      pop_tree #(lwid) ptl( lpop, vector[width-1:rwid] );
      pop_tree #(rwid) ptr( rpop, vector[rwid-1:0] );

      // The operands are extended to the width of pop before adding.
      assign pop = lpop + rpop;

   end

endmodule

`endif

// pop_s3: sequential population count. Each clock cycle the count of one
// chunk-bit slice of vector is accumulated; after width/chunk cycles the
// total is written to pop and a new pass begins.
//   width  - number of input bits; must be a multiple of chunk.
//   chunk  - number of input bits examined per clock cycle.
//   bits   - number of output bits.
//   pop    - count from the most recently completed pass.
//   vector - input bits to count.
//   clock  - state is updated on the positive edge.
module pop_s3
  #( int width = 128,
     int chunk = 2,
     int bits = $clog2(width+1) )
   ( output logic [bits:1] pop,
     input [width-1:0] vector,
     input clock );

   // cadence translate_off
   initial begin
      if ( width % chunk ) begin
         $display("Width must be a multiple of chunk size.\n");
         $fatal(1);
      end
   end
   // cadence translate_on

   localparam int rounds = (width+chunk-1)/chunk;
   logic [$clog2(rounds):1] round;   // Index of the slice examined this cycle.
   logic [bits:1] so_far;            // Count accumulated in the current pass.

   localparam int chunk_bits = $clog2(chunk+1);
   wire [chunk_bits:1] pop_chunk;

   wire [chunk-1:0] vector_part = vector[ round*chunk +: chunk ];

   // Instantiate a separate population counter for the current slice.
   pop_c #(chunk) pc(pop_chunk,vector_part);

   // cadence translate_off
   initial begin
      round = 0;
      // Without this the first completed pass would output X.
      so_far = 0;
   end
   // cadence translate_on

   always @( posedge clock ) begin

      if ( round == rounds - 1 ) begin

         // Last slice of the pass: publish the total and start over.
         pop <= so_far + pop_chunk;
         so_far <= 0;
         round <= 0;

      end else begin

         // Accumulate the count of the slice selected by round.
         so_far <= so_far + pop_chunk;
         round <= round + 1;

      end

   end

endmodule

// cadence translate_off

// test_pop: applies random vectors to each population-count module and
// compares every output against $countbits.
module test_pop();

   localparam int width = 128;
   localparam int bits = $clog2(width+1);
   localparam int numtests = 1000;

   // names[0] labels the reference value; names[k] labels the module
   // driving pop[k].
   string names[] =
     '{ "Testbench", "Combinational", "Linear elaboration.",
        "Tree elaboration.", "Sequential mod n-bits" };

   localparam int max_units = 10;
   localparam int err_limit = 4;   // Mismatches reported in detail per module.

   logic [width-1:0] vector;
   logic [bits-1:0] pop[max_units];
   logic clock;

   initial clock = 0;
   always #1 clock = !clock;

   pop_c #(width) p1( pop[1], vector);
   pop_n #(width) p2( pop[2], vector);
   pop_tree #(width,bits) p3( pop[3], vector);
   // The second parameter of pop_s3 is chunk, so each pass examines
   // bits-wide slices of the vector.
   pop_s3 #(.width(width), .chunk(bits)) p4( pop[4], vector, clock);

   int err_count[max_units];

   initial begin

      for ( int i=0; i<numtests; i++ ) begin

         // $random returns 32 bits, so fill the vector 32 bits at a time.
         for ( int j=0; j<width; j+=32 ) vector[j +: 32 ] = $random();

         pop[0] = $countbits( vector, 1 );

         // Leave time for the sequential module to finish a full pass.
         #1000;

         for ( int mut=1; mut<names.size(); mut++ ) begin

            if ( pop[0] !== pop[mut] ) begin

               err_count[mut]++;
               if ( err_count[mut] <= err_limit )
                 $display("Wrong answer in %s: %d != %d (correct)\n",
                          names[mut], pop[mut], pop[0]);
            end

         end

      end

      for ( int mut=1; mut<names.size(); mut++ )
        $display("Mut %s had %d errors, %.2f%%\n",
                 names[mut], err_count[mut],
                 100.0 * err_count[mut]/numtests);

      $finish(2);

   end

endmodule

// cadence translate_on

// Recorded synthesis results follow. They are kept out of compilation by
// a guard macro that is never defined.
`ifdef xxx

Synthesizing with args "-to_mapped -effort high"

128 bits

Module Name                 Area   Delay   Delay
                                  Actual  Target
pop_c                     271414    3246     100
pop_n                     396401    3296     100
pop_tree                  337278    3480     100
pop_c                     300928    3192    3000
pop_n                     390596    3280    3000
pop_tree                  351155    3302    3000
pop_c                     240371    3499    3500
pop_n                     297622    3500    3500
pop_tree                  276932    3500    3500
pop_c                      95937    3988    4000
pop_n                     158029    3997    4000
pop_tree                  202811    3995    4000
pop_c                      92284    4749    5000
pop_n                     123406    4737    5000
pop_tree                  200568    4998    5000
pop_c                      89860    4699  100000
pop_n                     119922    5514  100000
pop_tree                  168896    5246  100000

Synthesizing with args "-to_mapped -effort high" and params " 32 ".

Module Name                 Area   Delay   Delay
                                  Actual  Target
pop_c_width32              29069    2468     100
pop_n_width32              79872    2040     100
pop_tree_width32           87727    1874     100
pop_c_width32              24660    2775    3000
pop_n_width32              32970    2766    3000
pop_tree_width32           39937    2828    3000
pop_c_width32              21467    3325    3500
pop_n_width32              30621    3396    3500
pop_tree_width32           39307    2940    3500
pop_c_width32              21467    3307    4000
pop_n_width32              29957    3305    4000
pop_tree_width32           39488    3383    4000
pop_c_width32              21467    3307    5000
pop_n_width32              29957    3305    5000
pop_tree_width32           39620    3383    5000
pop_c_width32              21467    3307  100000
pop_n_width32              28625    3459  100000
pop_tree_width32           39318    3419  100000

`endif