Optimizing the product of a sparse symmetric matrix with a dense vector has been studied for a long time. Someone once compared the matrix operations of Mathematica with those of MATLAB and concluded that the slowness is due to copying of the data. Now I want to solve this problem through Mathematica's LibraryLink, for example by calling C++ Armadillo or C++ Eigen, without copying data. In the past, Szabolcs's LTemplate successfully called Armadillo, and Henrik Schumacher successfully called Eigen. I now have three methods to compute the product of a sparse symmetric matrix with a dense vector: using Mathematica alone, calling Armadillo through LibraryLink, and calling Eigen through LibraryLink. My code is as follows:
- Construct a sparse symmetric matrix H
(* Build a sparse nn x nn matrix (nn = n^2) from several SparseArray terms: H0 holds
   the strictly-upper-triangular couplings of an n x n lattice-like stencil (all
   explicit values equal v = 1.), assembled once in CSR form and then by position
   rules; H = H0 + Transpose[H0] symmetrizes it. minbegin/minend delimit the row
   ranges used for the off-by-(n-1) and off-by-(2n-1) diagonals. *)
Clear["`*"] n=300; nn=n*n; v=1.; {minbegin,minend}={Range[n+1,(n-3)*n+1,2*n],Range[2*n,(n-2)*n,2*n]}; H0=SparseArray[Automatic,{nn,nn},0,{1,{Flatten[Range[0,nn-n,1]~Append~ConstantArray[nn-n,n]],n+{#}&/@Range[1,nn-n,1]},ConstantArray[v,nn-n]}]+SparseArray[(#->v)&/@({Table[{i,i+n-1},{i,Range[minbegin[[#]]+1,minend[[#]],1]}]&/@Range[Length[minbegin]]}//Flatten//Partition[#,2]&),{nn,nn}]+SparseArray[(#->v)&/@({Table[{i,i+n*(n-1)},{i,1,n,1}]}//Flatten//Partition[#,2]&),{nn,nn}]+SparseArray[(#->v)&/@({Table[{i,i+n*(n-1)+1},{i,1,n-1,1}]}//Flatten//Partition[#,2]&),{nn,nn}]+SparseArray[(#->v)&/@({Table[{i,i+2*n-1},{i,minbegin}]}//Flatten//Partition[#,2]&),{nn,nn}]+SparseArray[(#->v)&/@({Table[{i,i+n*(n-2)+1},{i,{n}}]}//Flatten//Partition[#,2]&),{nn,nn}]; H=H0+Transpose[H0]; - Construct a dense vector
(* Dense test vector of length nn: entries Cos[2 Pi u] with u drawn uniformly from [0,1]. *)
f1=(Cos[2*Pi*#]&/@RandomReal[{0,1},nn]); - only use Mathematica
in: AbsoluteTiming[H.H.H.H.H.H.H.H.H.H.f1 // Total] out: {1.6734,7.63163*10^6} - Write a function that calls Armadillo to compute the product, using shared memory. (You need to install LTemplate and Armadillo)
(* LTemplate/Armadillo version. The C++ string defines view helpers that wrap
   Mathematica dense tensors as Armadillo objects without copying (copy_aux_mem =
   false); the sparse helper rebuilds an arma::SpMat from the CSR arrays (the
   SpMat constructor makes its own copy of the index/value data). Class Arma's
   ArmadilloDot applies H ten times to f2 and returns the result.
   NOTE(review): C++ operator* is left-associative, so H*H*...*H*f2 performs nine
   sparse matrix-matrix products before the final matrix-vector product; nesting
   from the right (H*(H*(...*(H*f2)))) keeps every step a matrix-vector product,
   as the answer comment below suggests. *)
Needs["LTemplate`"] code="#include <LTemplate.h> #define ARMA_COUT_STREAM mma::mout #define ARMA_CERR_STREAM mma::mout #include <armadillo> template<typename T> arma::Mat<T> toArmaTransposed(mma::MatrixRef<T> m) { return arma::Mat<T>(m.data(), m.cols(), m.rows(), false /* do not copy */, false /* until resized */); } template<typename T> mma::TensorRef<T> fromArmaTransposed(const arma::Mat<T> &m) { return mma::makeMatrix<T>(m.n_cols, m.n_rows, m.memptr()); } template<typename T> arma::Col<T> toArmaVec(mma::TensorRef<T> v) { return arma::Col<T>(v.data(), v.size(), false /* do not copy */, false /* until resized */); } template<typename T> mma::TensorRef<T> fromArmaVec(const arma::Col<T> &v) { return mma::makeVector<T>(v.size(), v.memptr()); } template<typename T> arma::SpMat<T> toArmaSparseTransposed(mma::SparseMatrixRef<T> sm) { return arma::SpMat<T>( arma::conv_to<arma::uvec>::from(toArmaVec(sm.columnIndices())) - 1, // convert to 0-based indices; Mathematica uses 1-based ones. arma::conv_to<arma::uvec>::from(toArmaVec(sm.rowPointers())), toArmaVec(sm.explicitValues()), sm.cols(), sm.rows() ); } template<typename T> mma::SparseMatrixRef<T> fromArmaSparse(const arma::SpMat<T> &am) { auto pos = mma::makeMatrix<mint>(am.n_nonzero, 2); // positions array auto vals = mma::makeVector<double>(am.n_nonzero); // values array mint i = 0; for (typename arma::SpMat<T>::const_iterator it = am.begin(); it != am.end(); ++it, ++i) { vals[i] = *it; pos(i,0) = it.row() + 1; // convert 0-based index to 1-based pos(i,1) = it.col() + 1; } auto mm = mma::makeSparseMatrix(pos, vals, am.n_rows, am.n_cols); pos.free(); vals.free(); return mm; } class Arma { public: mma::RealTensorRef ArmadilloDot(mma::SparseMatrixRef<double> mmaH, mma::RealTensorRef mmaf2) { arma::sp_mat H = toArmaSparseTransposed(mmaH); arma::vec f2 = toArmaVec(mmaf2); arma::vec F3; F3 = H*H*H*H*H*H*H*H*H*H*f2; return fromArmaVec<double>(F3); } };"; Export["Arma.h",code,"String"] template= LClass["Arma", { 
LFun["ArmadilloDot",{{LType[SparseArray,Real,2],"Shared"},{Real,1,"Shared"}},{Real,1}] } ]; CompileTemplate[template, "IncludeDirectories"->{"C:\\Users\\sidy\\AppData\\Roaming\\Mathematica\\Applications\\LTemplate\\armadillo-10.5.1\\include"}, "LibraryDirectories"->{"C:\\Users\\sidy\\AppData\\Roaming\\Mathematica\\Applications\\LTemplate\\armadillo-10.5.1\\examples\\lib_win64"}, "Libraries"->{"libopenblas"},"CompileOptions"->{"-std=c++11","-O2","-fopenmp"}] LoadTemplate[template] arma=Make[Arma] in: AbsoluteTiming[arma@"ArmadilloDot"[H, f1] // Total] out: {1.11007,7.63163*10^6} - Write a function to call Eigen to implement the production, where I use shared memory. (You need to install Eigen)
(* LibraryLink/Eigen version. The embedded C++ receives the matrix as COO data
   (row, col, val as Real tensors, already shifted to 0-based by the caller),
   rebuilds an Eigen::SparseMatrix via setFromTriplets, applies it ten times to B,
   and writes into a freshly allocated MTensor returned through Res.
   NOTE(review): A*A*...*A*B is left-associative, so it performs nine sparse
   matrix-matrix products before the final matrix-vector product; parenthesizing
   from the right keeps every step a matrix-vector product.
   NOTE(review): the triplet indices are passed as doubles and implicitly
   converted to Triplet's integer index type - presumably safe for these sizes,
   but confirm for very large n. *)
srcpath="~"; outpath="~"; Needs["CCompilerDriver`"]; Module[{opts,path,file,lib},If[!FileExistsQ[srcpath],CreateDirectory[srcpath]]; If[!FileExistsQ[outpath],CreateDirectory[outpath]]; file=Export[FileNameJoin[{srcpath,"cClipGeneralizedEigenvalues.cpp"}]," #include <iostream> #include <vector> #include\"WolframLibrary.h\" #include \"WolframSparseLibrary.h\" #include<Eigen/Eigenvalues> #include <Eigen/Sparse> using namespace std; using namespace Eigen; EXTERN_C DLLEXPORT int cClipGeneralizedEigenvalues(WolframLibraryData libData, mint Argc, MArgument *Args, MArgument Res) { MTensor MArow = MArgument_getMTensor(Args[0]); MTensor MAcol = MArgument_getMTensor(Args[1]); MTensor MAval = MArgument_getMTensor(Args[2]); MTensor MBvec = MArgument_getMTensor(Args[3]); MTensor MCout; libData->MTensor_new(MType_Real, 1, libData->MTensor_getDimensions(MBvec), &MCout); mint Mlength = MArgument_getInteger(Args[4]); mint n = MArgument_getInteger(Args[5]); Eigen::Map<Eigen::VectorXd >Arow(libData->MTensor_getRealData(MArow),Mlength); Eigen::Map<Eigen::VectorXd >Acol(libData->MTensor_getRealData(MAcol),Mlength); Eigen::Map<Eigen::VectorXd >Aval(libData->MTensor_getRealData(MAval),Mlength); Eigen::Map<Eigen::VectorXd >B(libData->MTensor_getRealData(MBvec),n); Eigen::Map<Eigen::VectorXd >C(libData->MTensor_getRealData(MCout),n); Eigen::SparseMatrix<double> A(n, n); vector < Triplet < double > > triplets ; for ( int i = 0 ; i < Mlength ; ++ i ) { triplets . emplace_back ( Arow(i) , Acol(i) , Aval(i)) ; } A . setFromTriplets ( triplets . begin ( ) , triplets . 
end ( ) ) ; C = A*A*A*A*A*A*A*A*A*A*B; MArgument_setMTensor(Res, MCout); return 0; }","Text"]; lib=CreateLibrary[{file},"cClipGeneralizedEigenvalues","TargetDirectory"->outpath,"IncludeDirectories"->{"C:\\Users\\sidy\\AppData\\Roaming\\Mathematica\\Applications\\eigen-3.4-rc1"},"CompileOptions"->{"-std=c++11","-O2","-fopenmp"}]; With[{libfile=lib},cClipGeneralizedEigenvalues::usage=""; cClipGeneralizedEigenvalues:=cClipGeneralizedEigenvalues=LibraryFunctionLoad[libfile,"cClipGeneralizedEigenvalues",{{Real,1,"Shared"},{Real,1,"Shared"},{Real,1,"Shared"},{Real,1,"Shared"},Integer,Integer},{Real,1,Automatic}];]] in: HCSR=ArrayRules[H][[1;;-2]]/.Rule->List//Flatten//Partition[#,3]&; {row,col,val}={HCSR[[;;,1]]-1,HCSR[[;;,2]]-1,Developer`ToPackedArray[HCSR[[;;,3]]]}; AbsoluteTiming[cClipGeneralizedEigenvalues[row,col,val,f1,Length[val],nn]//Total] out: {2.0971,7.63163*10^6} - Compare
(* Benchmark: rebuild H and f1 for n = 100..1000 (step 100), time all three
   methods, and plot the timings against n.
   NOTE(review): H.H.H...H.f1 evaluates the chained Dot with matrix-matrix
   products before the final matrix-vector product; the answer comment below
   recommends nesting Dot from the right, Dot[H, Dot[H, ...]], so that every
   step is a cheap matrix-vector product - that alone should change the
   complexity of all three benchmarks dramatically. *)
tab=Table[ nn=n*n; v=1.; {minbegin,minend}={Range[n+1,(n-3)*n+1,2*n],Range[2*n,(n-2)*n,2*n]}; H0=SparseArray[Automatic,{nn,nn},0,{1,{Flatten[Range[0,nn-n,1]~Append~ConstantArray[nn-n,n]],n+{#}&/@Range[1,nn-n,1]},ConstantArray[v,nn-n]}]+SparseArray[(#->v)&/@({Table[{i,i+n-1},{i,Range[minbegin[[#]]+1,minend[[#]],1]}]&/@Range[Length[minbegin]]}//Flatten//Partition[#,2]&),{nn,nn}]+SparseArray[(#->v)&/@({Table[{i,i+n*(n-1)},{i,1,n,1}]}//Flatten//Partition[#,2]&),{nn,nn}]+SparseArray[(#->v)&/@({Table[{i,i+n*(n-1)+1},{i,1,n-1,1}]}//Flatten//Partition[#,2]&),{nn,nn}]+SparseArray[(#->v)&/@({Table[{i,i+2*n-1},{i,minbegin}]}//Flatten//Partition[#,2]&),{nn,nn}]+SparseArray[(#->v)&/@({Table[{i,i+n*(n-2)+1},{i,{n}}]}//Flatten//Partition[#,2]&),{nn,nn}]; H=H0+Transpose[H0]; f1=(Cos[2*Pi*#]&/@RandomReal[{0,1},nn]); {AbsoluteTiming[arma@"ArmadilloDot"[H,f1]//Total][[1]], AbsoluteTiming[H.H.H.H.H.H.H.H.H.H.f1//Total][[1]], HCSR=ArrayRules[H][[1;;-2]]/.Rule->List//Flatten//Partition[#,3]&; {row,col,val}={HCSR[[;;,1]]-1,HCSR[[;;,2]]-1,Developer`ToPackedArray[HCSR[[;;,3]]]}; AbsoluteTiming[cClipGeneralizedEigenvalues[row,col,val,f1,Length[val],nn]//Total][[1]]},{n,100,1000,100}] ListLinePlot[{Thread[{Range[100,1000,100],tab[[;;,1]]}],Thread[{Range[100,1000,100],tab[[;;,2]]}],Thread[{Range[100,1000,100],tab[[;;,3]]}]},PlotLegends->{"Armadillo","Mathematica","Eigen"},AxesLabel->{"n(dimension of matrix is n^2)","time(s)"}] I found that Armadillo took the shortest time, followed by Mathematica and finally by Eigen. Is there any way to speed up my code?

Use Dot[matrix, Dot[matrix, vector]] to ensure a matrix-vector product at every step; try low-level functions, e.g. reference.wolfram.com/language/LowLevelLinearAlgebra/ref/…, or call BLAS directly. Another option is to use the CUDA variant of BLAS.