I am making a signed multiplier for that i have created half adder, full adder, ripple carry adder and then finally a multiplier. The code is shown below. How can i make it faster to achieve better timing. My final task is to make a fir filter operating running at 100Mhz. This filter use multiple multiplication operation(by using my multiplier). So Can you help me to make my design better by some optimizing technique like pipelining or parallelism or other??
//half adder
entity half_adder is
port (
a, b : in std_logic ;
sum, cout : out std_logic );
end half_adder;
architecture version1 of half_adder is
begin
sum <= a xor b;
cout <= a and b;
end version1;
-- full adder
entity full_adder is
port(
a : in std_logic;
b : in std_logic;
cin : in std_logic;
sum : out std_logic;
cout : out std_logic
);
end full_adder;
architecture structural of full_adder is
component half_adder
port (
a, b : in std_logic ;
sum, cout : out std_logic );
end component;
signal s1, c1, c2 : std_logic ;
begin -- structural
half_adder1 : half_adder
port map (
a => a, b => b,
sum => s1, cout => c1);
half_adder2 : half_adder
port map (
a =>cin, b => s1,
sum => sum, cout => c2);
cout <= c1 or c2;
end structural ;
-- ripple carry adder 40 bit
entity rca40bit is
generic (
width: integer := 40
);
port(
a : in std_logic_vector(width-1 downto 0);
b : in std_logic_vector(width-1 downto 0);
cin : in std_logic;
sum : out std_logic_vector(width-1 downto 0);
cout : out std_logic
);
end rca40bit;
architecture Behavioral of rca40bit is
component full_adder
port(
a : in std_logic;
b : in std_logic;
cin : in std_logic;
sum : out std_logic;
cout : out std_logic
);
end component;
signal s: std_logic_vector(width downto 0);
begin
s(0)<=cin;
FA:for i in 0 to width-1 generate
FA_i:full_adder
port map
(
a=>a(i),b=>b(i),cin=>s(i),sum=>sum(i),cout=>s(i+1)
);
end generate;
cout<=s(width);
end Behavioral;
-- Multiplier,i calculated partial products first then add them by rca adder.
library IEEE;
use IEEE.STD_LOGIC_1164.ALL;
use ieee.numeric_std.all;
entity signed_mult is
generic (
m : integer := 24; -- Multiplicand
n : integer := 16 ; -- multiplier;
bit1 :integer := 39 -- size of the adder
);
port(
a : in std_logic_vector(m - 1 downto 0) ;
b : in std_logic_vector(n - 1 downto 0) ;
prod : out std_logic_vector(bit1 downto 0)
);
end entity signed_mult;
architecture Behavioral of signed_mult is
component rca40bit
port(
a : in std_logic_vector(bit1 downto 0);
b : in std_logic_vector(bit1 downto 0);
cin : in std_logic;
sum : out std_logic_vector(bit1 downto 0);
cout : out std_logic
);
end component;
--partial products signals
signal p0 : std_logic_vector(bit1 downto 0):=(others =>'0') ; --1st
`enter code here`partial product size m-1 bits (24 bit)
signal p1: std_logic_vector (bit1 downto 0):=(others =>'0'); --2nd
partial product size m bits (25 bit)
signal p2 : std_logic_vector (bit1 downto 0):=(others =>'0'); --3rd partial
product size m+1 bits (26 bit)
signal p3 : std_logic_vector (bit1 downto 0):=(others =>'0'); --4th partial
product size m+2 bits (27 bit)
signal p4 : std_logic_vector (bit1 downto 0):=(others =>'0'); --5th
partial product size m+3 bits (28 bit)
signal p5 : std_logic_vector (bit1 downto 0):=(others =>'0'); --6th
partial product size m+4 bits (29 bit)
signal p6 : std_logic_vector (bit1 downto 0):=(others =>'0'); --7th
partial product size m+5 bits (30 bit)
signal p7 : std_logic_vector (bit1 downto 0):=(others =>'0'); --8th
partial product size m+6 bits (31 bit)
signal p8 : std_logic_vector (bit1 downto 0):=(others =>'0'); --9th
partial product size m+7 bits (32 bit)
signal p9 : std_logic_vector (bit1 downto 0):=(others =>'0'); --10th
partial product size m+8 bits (33 bit)
signal p10 : std_logic_vector (bit1 downto 0):=(others =>'0'); --11th
partial product size m+9 bits (34 bit)
signal p11 : std_logic_vector (bit1 downto 0):=(others =>'0'); --12th
partial product size m+10 bits (35 bit)
signal p12 : std_logic_vector (bit1 downto 0):=(others =>'0'); --13th
partial product size m+11 bits (36 bit)
signal p13 : std_logic_vector (bit1 downto 0):=(others =>'0'); --14th
partial product size m+12 bits (37 bit)
signal p14 : std_logic_vector (bit1 downto 0):=(others =>'0'); --15th
partial product size m+13 bits (38 bit)
signal p15 : std_logic_vector (bit1 downto 0); --16th partial product
size m+14 bits (39 bit)
signal p16 : std_logic_vector (bit1 downto 0);
signal p17 : std_logic_vector (bit1 downto 0):=(others =>'0');
signal extc0 :std_logic_vector (bit1 downto 0) ;
--constants and carry
signal zeros :std_logic_vector (bit1 downto 0) :=(others =>'0');
signal c : std_logic := '0' ;
signal c0 : std_logic :='0' ;
signal c12 : std_logic :='0' ;
signal c23 : std_logic :='0' ;
signal c34 ,c45,c56,c67,c78,c89,c910,c1011,c1112,c1213,c1314,c1415,c1516 ,c1617: std_logic := '0';
signal c15 : std_logic_vector (bit1 downto 0):=(others =>'0');
--sum only
signal sum12 : std_logic_vector (bit1 downto 0);
signal sum23 : std_logic_vector (bit1 downto 0);
signal sum34, sum45,sum56,sum67,sum78,
sum89,sum910,sum1011,sum1112,sum1213,sum1314,sum1415,sum1516,sum1617,ps0:
std_logic_vector (bit1 downto 0);
--signal f_sum: std_logic_vector (bit1 downto 0); --sum of p0 and P1
--signal tpp1 : std_logic_vector (bit1 downto 0);
begin
pp0: process is
begin
fst_partial_product :for i in 0 to m-1 loop -- 0 to 23
p0(i)<= (a(i) and b(0)) ;
if i=m-1 then
p0(i)<=not ( a(i) and b(0)); --msb=m=24
end if ;
end loop fst_partial_product ;
n2d_partial_product : for i in 0 to m-1 loop -- 24
p1(0)<='0';
p1(i+1)<=a(i) and b(1) ;
if i=m-1 then
p1(i+1)<=not (a(i) and b(1)); --msb=m=24
end if ;
end loop n2d_partial_product;
r3d_partial_product : for i in 0 to m-1 loop
p2(0)<='0';
p2(1)<='0';
p2(i+2)<=a(i) and b(2) ;
if i=m-1 then
p2(i+2)<=not( a(i) and b(2)); --msb=25
end if ;
end loop r3d_partial_product;
r4th_partial_product : for i in 0 to m-1 loop
p3(0)<='0';
p3(1)<='0';
p3(2)<='0';
p3(i+3)<=a(i) and b(3);
if i=m-1 then
p3(i+3)<=not( a(i) and b(3)); --msb=26
end if ;
end loop r4th_partial_product;
r5d_partial_product : for i in 0 to m-1 loop
p4(0)<='0';
p4(1)<='0';
p4(2)<='0';
p4(3)<='0';
p4(i+4)<=a(i) and b(4) ;
if i=m-1 then
p4(i+4)<=not( a(i) and b(4)); --msb=27
end if ;
end loop r5d_partial_product;
t6hpartial_product : for i in 0 to m-1 loop
p5(0)<='0';
p5(1)<='0';
p5(2)<='0';
p5(3)<='0';
p5(4)<='0';
p5(i+5)<=a(i) and b(5) ;
if i=m-1 then
p5(i+5)<=not( a(i) and b(5)); --msb28
end if;
end loop t6hpartial_product ;
t7h_partial_product : for i in 0 to m-1 loop
p6(0)<='0';
p6(1)<='0';
p6(2)<='0';
p6(3)<='0';
p6(4)<='0';
p6(5)<='0';
p6(i+6)<=a(i) and b(6) ;
if i=m-1 then
p6(i+6)<=not (a(i) and b(6)); --msb=29
end if;
end loop t7h_partial_product;
t8th_partial_product : for i in 0 to m-1 loop
p7(0)<='0';
p7(1)<='0';
p7(2)<='0';
p7(3)<='0';
p7(4)<='0';
p7(5)<='0';
p7(6)<='0';
p7(i+7)<=a(i) and b(7) ;
if i=m-1 then
p7(i+7)<=not( a(i) and b(7)); --msb=30
end if;
end loop t8th_partial_product ;
t9th_partial_product : for i in 0 to m-1 loop
p8(0)<='0';
p8(1)<='0';
p8(2)<='0';
p8(3)<='0';
p8(4)<='0';
p8(5)<='0';
p8(6)<='0';
p8(7)<='0';
p8(i+8)<=a(i) and b(8);
if i=m-1 then
p8(i+8)<=not( a(i) and b(8)); --msb=31
end if;
end loop t9th_partial_product;
t10th_partial_product : for i in 0 to m-1 loop
p9(0)<='0';
p9(1)<='0';
p9(2)<='0';
p9(3)<='0';
p9(4)<='0';
p9(5)<='0';
p9(6)<='0';
p9(7)<='0';
p9(8)<='0';
p9(i+9)<=a(i) and b(9) ;
if i=m-1 then
p9(i+9)<=not( a(i) and b(9)); --msb=32
end if;
end loop t10th_partial_product ;
t11th_partial_product : for i in 0 to m-1 loop
p10(0)<='0';
p10(1)<='0';
p10(2)<='0';
p10(3)<='0';
p10(4)<='0';
p10(5)<='0';
p10(6)<='0';
p10(7)<='0';
p10(8)<='0';
p10(9)<='0';
p10(i+10)<=a(i) and b(10) ;
if i=m-1 then
p10(i+10)<=not( a(i) and b(10)); --msb=31
end if;
end loop t11th_partial_product;
t12th_partial_product : for i in 0 to m-1 loop
p11(0)<='0';
p11(1)<='0';
p11(2)<='0';
p11(3)<='0';
p11(4)<='0';
p11(5)<='0';
p11(6)<='0';
p11(7)<='0';
p11(8)<='0';
p11(9)<='0';
p11(10)<='0';
p11(i+11)<=a(i) and b(11) ;
if i=m-1 then
p11(i+11)<=not( a(i) and b(11)); --msb=32
end if;
end loop t12th_partial_product;
t13th_partial_product : for i in 0 to m-1 loop
p12(0)<='0';
p12(1)<='0';
p12(2)<='0';
p12(3)<='0';
p12(4)<='0';
p12(5)<='0';
p12(6)<='0';
p12(7)<='0';
p12(8)<='0';
p12(9)<='0';
p12(10)<='0';
p12(11)<='0';
p12(i+12)<=a(i) and b(12) ;
if i=m-1 then
p12(i+12)<=not( a(i) and b(12)); --msb=32
end if;
end loop t13th_partial_product ;
t14th_partial_product : for i in 0 to m-1 loop
p13(0)<='0';
p13(1)<='0';
p13(2)<='0';
p13(3)<='0';
p13(4)<='0';
p13(5)<='0';
p13(6)<='0';
p13(7)<='0';
p13(8)<='0';
p13(9)<='0';
p13(10)<='0';
p13(11)<='0';
p13(12)<='0';
p13(i+13)<=a(i) and b(13) ;
if i=m-1 then
p13(i+13)<=not( a(i) and b(13)); --msb=32
end if;
end loop t14th_partial_product ;
t15th_partial_product : for i in 0 to m-1 loop
p14(0)<='0';
p14(1)<='0';
p14(2)<='0';
p14(3)<='0';
p14(4)<='0';
p14(5)<='0';
p14(6)<='0';
p14(7)<='0';
p14(8)<='0';
p14(9)<='0';
p14(10)<='0';
p14(11)<='0';
p14(12)<='0';
p14(13)<='0';
p14(14)<='0';
p14(i+14)<=a(i) and b(14) ;
if i=m-1 then
p14(i+14)<=not( a(i) and b(14)); --msb=32
end if;
end loop t15th_partial_product;
t16th_partial_product : for i in 0 to m-1 loop
p15(0)<='0';
p15(1)<='0';
p15(2)<='0';
p15(3)<='0';
p15(4)<='0';
p15(5)<='0';
p15(6)<='0';
p15(7)<='0';
p15(8)<='0';
p15(9)<='0';
p15(10)<='0';
p15(11)<='0';
p15(12)<='0';
p15(13)<='0';
p15(14)<='0';
p15(i+15)<=not (a(i) and b(15)) ;
if i=m-1 then
p15(i+15)<= (a(i) and b(15));
end if;
end loop t16th_partial_product ;
wait on a,b;
end process pp0;
p16(39)<='1';
p16(38 downto 0)<=p15(38 downto 0);
--sum
extc0(39 downto 0 )<=zeros(39 downto 24) & '1' & zeros(22 downto 0 ); -- just store 1 on msb of 1st pp
c15(39 downto 0 )<=zeros(39 downto 16) & '1' & zeros(14 downto 0 ) ; -- just store 1 on msb of 1st pp
--sum temp p0
tmp1st_pp:rca40bit
port map(a=>extc0 , b=>p0, cin=>c, sum=>ps0 ,cout=>c0); ---carry + p0
--sum temp p0
--ttmp1st_pp:rca40bit
--port map(a=>extc0 , b=>p0, cin=>c, sum=>ps1 ,cout=>c0); ---carry + p0
--sum 1st and 2nd row of pp
s1tnd_pp:rca40bit
port map(a=>ps0 , b=>p1, cin=>c, sum=>sum12 ,cout=>c12);
--adding sum of s12 and 3rd row of pp
t3rd_pp:rca40bit
port map(a=>sum12, b=>p2, cin=>c, sum=>sum23 ,cout=>c23);
--adding sum of s23 and 4rt row of pp
t4rh_pp:rca40bit
port map(a=>sum23, b=>p3, cin=>c, sum=>sum34 ,cout=>c34);
t5rh_pp:rca40bit
port map(a=>sum34, b=>p4, cin=>c, sum=>sum45 ,cout=>c45);
t6rh_pp:rca40bit
port map(a=>sum45, b=>p5, cin=>c, sum=>sum56 ,cout=>c56);
t7th_pp:rca40bit
port map(a=>sum56, b=>p6, cin=>c, sum=>sum67 ,cout=>c67);
t8th_pp:rca40bit
port map(a=>sum67, b=>p7, cin=>c, sum=>sum78 ,cout=>c78);
t9th_pp:rca40bit
port map(a=>sum78, b=>p8, cin=>c, sum=>sum89 ,cout=>c89);
t10th_pp:rca40bit
port map(a=>sum89, b=>p9, cin=>c, sum=>sum910 ,cout=>c910);
t11th_pp:rca40bit
port map(a=>sum910, b=>p10, cin=>c, sum=>sum1011 ,cout=>c1011);
t12th_pp:rca40bit
port map(a=>sum1011, b=>p11, cin=>c, sum=>sum1112 ,cout=>c1112);
t13th_pp:rca40bit
port map(a=>sum1112, b=>p12, cin=>c, sum=>sum1213 ,cout=>c1213);
t14th_pp:rca40bit
port map(a=>sum1213, b=>p13, cin=>c, sum=>sum1314 ,cout=>c1314);
t15th_pp:rca40bit
port map(a=>sum1314, b=>p14, cin=>c, sum=>sum1415,cout=>c1415);
carryaddwith15th_pp:rca40bit
port map(a=>sum1415, b=>p16, cin=>c, sum=>sum1516,cout=>c1617);
t16th_pp:rca40bit
port map(a=>c15, b=>sum1516, cin=>c, sum=>sum1617,cout=>c1516);
prod <=sum1617;
end Behavioral;
Filter Design
library IEEE;
USE IEEE.STD_LOGIC_1164.ALL;
USE IEEE.NUMERIC_STD.ALL;
entity your_filter is
Generic (
constant PIPELINE_DEPTH : positive := 2;
constant DATA_WIDTH : positive := 24;
constant FIR_ORDER : positive := 34;
constant COEFF_WIDTH : positive := 16
);
Port (
clk : in STD_LOGIC;
rst : in STD_LOGIC;
data_in : in STD_LOGIC_VECTOR (DATA_WIDTH - 1 downto 0);
data_out : out STD_LOGIC_VECTOR (DATA_WIDTH - 1 downto 0)
);
end your_filter;
architecture Behavioral of your_filter is
component ripple_ca is
port(
a : in std_logic_vector(DATA_WIDTH+COEFF_WIDTH-1 downto 0);
b : in std_logic_vector(DATA_WIDTH+COEFF_WIDTH-1 downto 0);
cin : in std_logic;
sum : out std_logic_vector(DATA_WIDTH+COEFF_WIDTH-1 downto 0);
cout : out std_logic
);
end component;
component signed_mult is
port(
a : in std_logic_vector(DATA_WIDTH - 1 downto 0) ;
b : in std_logic_vector(COEFF_WIDTH - 1 downto 0) ;
prod : out std_logic_vector(DATA_WIDTH+COEFF_WIDTH-1 downto 0)
);
end component;
type REG_TYPE is array (0 to FIR_ORDER-1) of signed (DATA_WIDTH+COEFF_WIDTH-1 downto 0);
type COEFF_ARRAY_TYPE is array (0 to FIR_ORDER) of signed(COEFF_WIDTH-1 downto 0);
type REG_TYPE2 is array (0 to FIR_ORDER-1) of std_logic_vector (DATA_WIDTH-1 downto 0);
type sum is array (0 to FIR_ORDER) of std_logic_vector(DATA_WIDTH+COEFF_WIDTH-1 downto 0);
signal sum1 : sum;
signal c1 : std_logic;
signal c : std_logic :='0';
type COEFF_ARRAY1 is array (0 to FIR_ORDER) of std_logic_vector(COEFF_WIDTH-1 downto 0);
signal coeff : COEFF_ARRAY1;
signal reg : REG_TYPE2;
signal next_reg : REG_TYPE2;
signal reg1 : REG_TYPE2;
type REG_TYPE1 is array (0 to FIR_ORDER-1) of std_logic_vector (DATA_WIDTH+COEFF_WIDTH-1 downto 0);
signal next_reg1 : REG_TYPE1;
signal data_out_temp : std_logic_vector(DATA_WIDTH+COEFF_WIDTH-1 downto 0);
signal tempmult : sum;
signal data_in_reg : STD_LOGIC_VECTOR (DATA_WIDTH - 1 downto 0);
signal next_data_out : STD_LOGIC_VECTOR (DATA_WIDTH - 1 downto 0);
constant coeff_array : COEFF_ARRAY_TYPE := (
"0000000101011111",
"0000000001100001",
"0000000000110011",
"1111111111011010",
"1111111101011111",
"1111111011011001",
"1111111001100101",
"1111111000100101",
"1111111000111011",
"1111111010111111",
"1111111110111001",
"0000000100100000",
"0000001011011000",
"0000010010110011",
"0000011001111010",
"0000011111110010",
"0000100011101011",
"0000100101000010",
"0000100011101011",
"0000011111110010",
"0000011001111010",
"0000010010110011",
"0000001011011000",
"0000000100100000",
"1111111110111001",
"1111111010111111",
"1111111000111011",
"1111111000100101",
"1111111001100101",
"1111111011011001",
"1111111101011111",
"1111111111011010",
"0000000000110011",
"0000000001100001",
"0000000101011111"
);
begin
--synchronous process
sync_proc : process (clk)
begin
if rising_edge(clk) then
if rst = '0' then
reg <= (others=>(others=>'0'));
data_in_reg <= (others=>'0');
data_out <= (others=>'0');
else
reg <= next_reg;
data_in_reg <= data_in;
data_out <= next_data_out;
end if;
end if;
end process;
--asynchronous process
async_proc : process (reg, data_in_reg)
variable sum : signed(DATA_WIDTH+COEFF_WIDTH-1 downto 0) := (others => '0');
begin
for i in 0 to FIR_ORDER -2 loop
next_reg(i+1) <= reg(i);
end loop;
next_reg(0) <= (data_in_reg);
-- data_out_temp <= std_logic_vector(sum + signed(data_in_reg)*coeff_array(0));--std_logic_vector(sum);--(sum);--
for i in 0 to FIR_ORDER-1 loop
coeff(i)<= std_logic_vector (coeff_array(i));
reg1(i)<= std_logic_vector (reg(i));
--next_reg1(i)<= std_logic_vector (next_reg(i));
end loop;
coeff(FIR_ORDER)<= std_logic_vector (coeff_array(FIR_ORDER));
end process;
--next_reg(i) <= reg(i+1)+(signed(data_in_reg))*coeff_array(i+1);
nextreg : for i in 0 to FIR_ORDER -1 generate
accumumlator : signed_mult
port map ( a => reg1(i), b => coeff(i+1), prod => tempmult(i) );
end generate nextreg;
sum1(0) <= (others => '0');
addition : for i in 0 to FIR_ORDER -1 generate
addition01 : ripple_ca
port map ( a => sum1(i),b => tempmult(i) , cin=>c, sum => sum1(i+1),cout => c1 );
end generate addition;
-- data_out_temp <= std_logic_vector(reg(0) + signed(data_in_reg)*coeff_array(0));--std_logic_vector(sum);--(sum);--
dataout : signed_mult
port map (
a => data_in_reg,
b => coeff(0),
prod => tempmult(FIR_ORDER)
);
addition_2 : ripple_ca
port map (a => sum1(FIR_ORDER),b => tempmult(FIR_ORDER) ,cin=>c , sum => data_out_temp ,cout => c1);
next_data_out <= data_out_temp(DATA_WIDTH+COEFF_WIDTH-1 downto COEFF_WIDTH);
end Behavioral;
Ваш текущий метод «Я создал полусумматор, полусумматор…» не совпадает с вашим целевым хорошее время. Для быстрого результата вам не следует создавать свой собственный сумматор / умножители, но пусть инструмент синтеза справится с этим.
@ Сложно, да, это академическое упражнение. Файл временных ограничений .xdc i hv не загружен, но генерирует тактовую частоту 10 нс, что означает 100 МГц.
@Oldfart Предмет моего курса - продвинутый vlsi, требование которого состоит в том, чтобы студенты создавали быстрые компоненты, а не путем синтеза.
Но в вашем дизайне нет часов - поэтому не имеет значения, что находится в файле XDC, поскольку вам не на что рассчитывать.
Код не анализирует, имеет комментарии, охватывающие новые строки, и отсутствующие предложения контекста. Вопрос «Так не могли бы вы помочь мне улучшить мой дизайн с помощью некоторых методов оптимизации, таких как конвейерная обработка, параллелизм или что-то еще?» появляется слишком широкий. Какой инструмент синтеза, целевое устройство или примитивная библиотека и технология, а также пути, нарушающие ограничения? Для сокращения сумматоров до n битов потребуется гораздо меньше усилий по синтезу. Покажите свои требования к заданию. Что было сделано до сих пор для идентификации и решения проблемы?
@Tricky Filter Code добавлен сейчас, вы можете увидеть использование clk.
@ user1155120 Сюда добавлен код фильтра. При этом умножение data_in на массив коэффициентов выполняется 34 раза. Моё требование к дизайну - работать с фильтром на частоте 100 МГц. В настоящее время я получаю слабину -33 нс. Мне нужно улучшить свой дизайн - это может быть дизайн множителя или дизайн фильтра.
Вам нужно добавить больше конвейерной обработки. В основном, разбейте ваше умножение. Или для лучшего результата> 200 МГц используйте встроенный умножитель.
@ Хитро, да, ты правильно говоришь. Мне нужно больше конвейерной обработки. Но я ничего не знаю о конвейерном коде и концепции. Вы можете мне помочь с конвейеризацией ??
@ Сложно, где в моем коде я могу разместить конвейеры.
@ZohaibRamzan Конвейерная обработка включает размещение регистров между разделами логики. Ваш код включает одно большое логическое облако. Вам нужно разбить его на более мелкие части. Хотя это может увеличить задержку, тактовую частоту можно увеличить более чем достаточно, чтобы компенсировать задержку.
Это академическое упражнение? обычно самые быстрые результаты достигаются при использовании специальных умножителей на микросхеме. В вашем коде, как вы ожидаете получить FMax без часов?