libpappsomspp
Library for mass spectrometry
pappso::Enzyme Class Reference

#include <enzyme.h>

Public Member Functions

 Enzyme ()
 build the default enzyme (trypsin) with recognition_site = "([KR])([^P])" More...
 
 Enzyme (const QString &recognition_site)
 build any enzyme given a recognition_site More...
 
 ~Enzyme ()
 
void eat (std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, EnzymeProductInterface &enzyme_product) const
 digest a protein into enzyme products More...
 
void setMiscleavage (unsigned int miscleavage)
 sets the maximum number of missed cleavages allowed in the digestion More...
 
unsigned int getMiscleavage () const
 get the maximum number of missed cleavages allowed in the digestion More...
 
void setTakeOnlyFirstWildcard (bool take_only_first_wildcard)
 take only the first possibility when the sequence contains X, B or Z wildcards (sets m_takeOnlyFirstWildcard) More...
 
void setMaxPeptideVariantListSize (std::size_t max_peptide_variant_list_size)
 if there are wildcards in the protein sequence : restrict the number of possible peptide sequences More...
 
const QRegularExpression & getQRegExpRecognitionSite () const
 

Private Member Functions

void sanityCheck (EnzymeProductInterface &enzyme_product, std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme) const
 
void replaceWildcards (std::vector< std::string > *p_peptide_variant_list) const
 

Private Attributes

QRegularExpression m_recognitionSite
 example with a kinase == [K,R] More...
 
unsigned int m_miscleavage = 0
 
bool m_takeOnlyFirstWildcard = false
 
std::size_t m_maxPeptideVariantListSize = 100
 
std::vector< char > m_wildCardX
 
std::vector< char > m_wildCardB
 
std::vector< char > m_wildCardZ
 

Detailed Description

Definition at line 31 of file enzyme.h.

Constructor & Destructor Documentation

◆ Enzyme() [1/2]

pappso::Enzyme::Enzyme ( )

build the default enzyme (trypsin) with recognition_site = "([KR])([^P])"

Definition at line 32 of file enzyme.cpp.

33 {
34  m_recognitionSite.setPattern("([KR])([^P])");
35  m_miscleavage = 0;
36 
37 
38  char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
39  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
40  m_wildCardX.assign(std::begin(vv1), std::end(vv1));
41 
42  char vv2[] = {'N', 'D'};
43  m_wildCardB.assign(std::begin(vv2), std::end(vv2));
44 
45  char vv3[] = {'Q', 'E'};
46  m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
47 }
QRegularExpression m_recognitionSite
example with a kinase == [K,R]
Definition: enzyme.h:89
std::vector< char > m_wildCardB
Definition: enzyme.h:97
std::vector< char > m_wildCardZ
Definition: enzyme.h:98
std::vector< char > m_wildCardX
Definition: enzyme.h:96
unsigned int m_miscleavage
Definition: enzyme.h:90

References m_miscleavage, m_recognitionSite, m_wildCardB, m_wildCardX, and m_wildCardZ.

◆ Enzyme() [2/2]

pappso::Enzyme::Enzyme ( const QString &  recognition_site)

build any enzyme given a recognition_site

Parameters
recognition_site: a regular expression that must identify 2 motifs: one on the Nter side, one on the Cter side

Definition at line 49 of file enzyme.cpp.

50 {
51  m_recognitionSite.setPattern(recognition_site);
52  m_miscleavage = 0;
53 
54 
55  char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
56  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
57  m_wildCardX.assign(std::begin(vv1), std::end(vv1));
58 
59  char vv2[] = {'N', 'D'};
60  m_wildCardB.assign(std::begin(vv2), std::end(vv2));
61 
62  char vv3[] = {'Q', 'E'};
63  m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
64 }

References m_miscleavage, m_recognitionSite, m_wildCardB, m_wildCardX, and m_wildCardZ.

◆ ~Enzyme()

pappso::Enzyme::~Enzyme ( )

Definition at line 66 of file enzyme.cpp.

67 {
68 }

Member Function Documentation

◆ eat()

void pappso::Enzyme::eat ( std::int8_t  sequence_database_id,
const ProteinSp & protein_sp,
bool  is_decoy,
EnzymeProductInterface & enzyme_product 
) const

digest a protein into enzyme products

Parameters
sequence_database_id: integer that references the sequence database (file, stream, url...)
protein_sp: the original protein to be digested
is_decoy: tells whether the current protein is a decoy (true) or a normal (false) protein
enzyme_product: the object that will receive the digestion products

Definition at line 87 of file enzyme.cpp.

91 {
92  /*
93  * for aa in self.aa_to_cut:
94  seq = seq.replace(aa, aa + ' ')
95  seq_stack = []
96  for s in seq.strip().split(' '):
97  seq_stack.append(s)
98  if len(seq_stack) > self.misscleavage + 1:
99  seq_stack.pop(0)
100  s2 = ""
101  for s_miss in seq_stack[::-1]:
102  s2 = s_miss + s2
103  yield s2
104  */
105  qDebug() << "Enzyme::eat begin ";
106  const QString sequence = protein_sp.get()->getSequence();
107  qDebug() << sequence;
108  QStringList peptide_list;
109  int pos = 0;
110  int peptide_start = 0;
111  int peptide_size = sequence.size();
112  QRegularExpressionMatch match_recognition_site =
113  m_recognitionSite.match(sequence, pos);
114  while(match_recognition_site.hasMatch())
115  {
116  pos = match_recognition_site.capturedStart(0);
117  peptide_size =
118  pos + match_recognition_site.captured(1).length() - peptide_start;
119  // qDebug() << "pos=" << pos << " peptide_start=" << peptide_start << "
120  // peptide_size=" << peptide_size << " " <<
121  // sequence.mid(peptide_start,peptide_size);
122  if(peptide_size > 0)
123  {
124  peptide_list.append(sequence.mid(peptide_start, peptide_size));
125  }
126  peptide_start += peptide_size;
127  pos = peptide_start; // all peptides MUST be consecutive
128  match_recognition_site = m_recognitionSite.match(sequence, pos);
129  }
130  peptide_size = sequence.size() - peptide_start;
131  if(peptide_size > 0)
132  {
133  peptide_list.append(sequence.mid(peptide_start, peptide_size));
134  }
135 
136  unsigned int start = 1;
137  bool is_nter = true;
138  foreach(const QString &peptide, peptide_list)
139  {
140  // enzyme_product.setPeptide(sequence_database_id, protein_sp,is_decoy,
141  // peptide, start,is_nter,0, false);
142  sanityCheck(enzyme_product,
143  sequence_database_id,
144  protein_sp,
145  is_decoy,
146  peptide,
147  start,
148  is_nter,
149  0,
150  false);
151  is_nter = false;
152  start += peptide.size();
153  }
154 
155  unsigned int miscleavage_i = 0;
156  while(miscleavage_i < m_miscleavage)
157  {
158  miscleavage_i++;
159  qDebug() << "miscleavage_i=" << miscleavage_i;
160  int chunk_number = miscleavage_i + 1;
161  unsigned int start = 1;
162  bool is_nter = true;
163 
164  for(auto i = 0; i < peptide_list.size(); ++i)
165  {
166  qDebug() << "start=" << start;
167  QStringList peptide_mis_list;
168  for(auto j = 0; (j < chunk_number) && ((i + j) < peptide_list.size());
169  j++)
170  {
171  peptide_mis_list << peptide_list.at(i + j);
172  }
173  if(peptide_mis_list.size() == chunk_number)
174  {
175  // enzyme_product.setPeptide(sequence_database_id,
176  // protein_sp,is_decoy, peptide_mis_list.join(""), start,is_nter,
177  // miscleavage_i, false);
178  sanityCheck(enzyme_product,
179  sequence_database_id,
180  protein_sp,
181  is_decoy,
182  peptide_mis_list.join(""),
183  start,
184  is_nter,
185  miscleavage_i,
186  false);
187  }
188  is_nter = false;
189  start += peptide_list.at(i).size();
190  }
191  }
192 }
void sanityCheck(EnzymeProductInterface &enzyme_product, std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme) const
Definition: enzyme.cpp:276

References m_miscleavage, m_recognitionSite, and sanityCheck().

◆ getMiscleavage()

unsigned int pappso::Enzyme::getMiscleavage ( ) const

get the maximum number of missed cleavages allowed in the digestion

Returns
miscleavage: maximum number of missed cleavages to allow (default is 0)

Definition at line 76 of file enzyme.cpp.

77 {
78  return m_miscleavage;
79 }

References m_miscleavage.

◆ getQRegExpRecognitionSite()

const QRegularExpression & pappso::Enzyme::getQRegExpRecognitionSite ( ) const

Definition at line 353 of file enzyme.cpp.

354 {
355  return m_recognitionSite;
356 }

References m_recognitionSite.

◆ replaceWildcards()

void pappso::Enzyme::replaceWildcards ( std::vector< std::string > *  p_peptide_variant_list) const
private

Definition at line 195 of file enzyme.cpp.

196 {
197  std::string new_peptide = p_peptide_variant_list->at(0);
198  qDebug() << "Enzyme::replaceWildcards begin " << new_peptide.c_str();
199  std::vector<std::string> old_peptide_variant_list;
200  old_peptide_variant_list.assign(p_peptide_variant_list->begin(),
201  p_peptide_variant_list->end());
202 
203 
204  for(char wildcard : {'X', 'B', 'Z'})
205  {
206 
207  std::size_t position = new_peptide.find(wildcard);
208  if(position == std::string::npos)
209  {
210  continue;
211  }
212  else
213  {
214  p_peptide_variant_list->clear();
215  /*
216  new_peptide[position] = 'A';
217  p_peptide_variant_list->push_back(new_peptide);
218  break;
219  */
220 
221  const std::vector<char> *p_x_replace_wildcard = nullptr;
222  if(wildcard == 'X')
223  {
224  p_x_replace_wildcard = &m_wildCardX;
225  }
226  else if(wildcard == 'B')
227  {
228  p_x_replace_wildcard = &m_wildCardB;
229  }
230  else if(wildcard == 'Z')
231  {
232  p_x_replace_wildcard = &m_wildCardZ;
233  }
234 
235  if(p_x_replace_wildcard != nullptr)
236  {
237  for(std::string orig_peptide : old_peptide_variant_list)
238  {
239  for(char replace : *p_x_replace_wildcard)
240  {
241  orig_peptide[position] = replace;
242  p_peptide_variant_list->push_back(orig_peptide);
243  }
244  }
245  }
246  else
247  {
248  throw ExceptionNotPossible(
249  QObject::tr("x_replace_wildcard is empty"));
250  }
251  // new_peptide[position] = 'A';
252  // p_peptide_variant_list->push_back(new_peptide);
253  // p_peptide_variant_list->resize(1);
254  // std::cerr << "Enzyme::replaceWildcards begin
255  // p_peptide_variant_list.size()=" << p_peptide_variant_list->size()
256  // <<
257  // endl;
258  break;
259  }
260  }
261  std::vector<std::string>().swap(
262  old_peptide_variant_list); // clear old_peptide_variant_list reallocating
263 
264 
265  qDebug() << "Enzyme::replaceWildcards end " << new_peptide.c_str();
266 }

References m_wildCardB, m_wildCardX, and m_wildCardZ.

Referenced by sanityCheck().

◆ sanityCheck()

void pappso::Enzyme::sanityCheck ( EnzymeProductInterface & enzyme_product,
std::int8_t  sequence_database_id,
const ProteinSp & protein_sp,
bool  is_decoy,
const PeptideStr & peptide,
unsigned int  start,
bool  is_nter,
unsigned int  missed_cleavage_number,
bool  semi_enzyme 
) const
private

Definition at line 276 of file enzyme.cpp.

285 {
286  if(peptide.contains('X') || peptide.contains('B') || peptide.contains('Z'))
287  {
288 
289  std::vector<std::string> peptide_variant_list;
290  peptide_variant_list.push_back(peptide.toStdString());
291 
292  while((peptide_variant_list.at(0).find('X') != std::string::npos) ||
293  (peptide_variant_list.at(0).find('B') != std::string::npos) ||
294  (peptide_variant_list.at(0).find('Z') != std::string::npos))
295  {
296  replaceWildcards(&peptide_variant_list);
297  if(peptide_variant_list.size() > m_maxPeptideVariantListSize)
298  {
299  peptide_variant_list.resize(m_maxPeptideVariantListSize);
300  peptide_variant_list.shrink_to_fit();
301  }
302  }
303 
304  // peptide_variant_list.resize(2);
305  if(m_takeOnlyFirstWildcard)
306 {
307  enzyme_product.setPeptide(sequence_database_id,
308  protein_sp,
309  is_decoy,
310  QString(peptide_variant_list.at(0).c_str()),
311  start,
312  is_nter,
313  missed_cleavage_number,
314  semi_enzyme);
315  }
316  else
317  {
318  std::string peptide_variant = peptide_variant_list.back();
319  while(peptide_variant_list.size() > 0)
320  {
321  enzyme_product.setPeptide(sequence_database_id,
322  protein_sp,
323  is_decoy,
324  QString(peptide_variant.c_str()),
325  start,
326  is_nter,
327  missed_cleavage_number,
328  semi_enzyme);
329  peptide_variant_list.pop_back();
330  if(peptide_variant_list.size() > 0)
331  {
332  peptide_variant = peptide_variant_list.back();
333  }
334  }
335  }
336  std::vector<std::string>().swap(
337  peptide_variant_list); // clear peptide_variant_list reallocating
338  }
339  else
340  {
341  enzyme_product.setPeptide(sequence_database_id,
342  protein_sp,
343  is_decoy,
344  peptide,
345  start,
346  is_nter,
347  missed_cleavage_number,
348  semi_enzyme);
349  }
350 }
std::size_t m_maxPeptideVariantListSize
Definition: enzyme.h:93
void replaceWildcards(std::vector< std::string > *p_peptide_variant_list) const
Definition: enzyme.cpp:195
bool m_takeOnlyFirstWildcard
Definition: enzyme.h:91

References m_maxPeptideVariantListSize, m_takeOnlyFirstWildcard, replaceWildcards(), and pappso::EnzymeProductInterface::setPeptide().

Referenced by eat().

◆ setMaxPeptideVariantListSize()

void pappso::Enzyme::setMaxPeptideVariantListSize ( std::size_t  max_peptide_variant_list_size)

if there are wildcards in the protein sequence : restrict the number of possible peptide sequences

Parameters
max_peptide_variant_list_size: maximum number of peptide variants (default is 100)

Definition at line 81 of file enzyme.cpp.

82 {
83  m_maxPeptideVariantListSize = max_peptide_variant_list_size;
84 }

References m_maxPeptideVariantListSize.

◆ setMiscleavage()

void pappso::Enzyme::setMiscleavage ( unsigned int  miscleavage)

sets the maximum number of missed cleavages allowed in the digestion

Parameters
miscleavage: maximum number of missed cleavages to allow (default is 0)

Definition at line 71 of file enzyme.cpp.

72 {
73  m_miscleavage = miscleavage;
74 }

References m_miscleavage.

◆ setTakeOnlyFirstWildcard()

void pappso::Enzyme::setTakeOnlyFirstWildcard ( bool  take_only_first_wildcard)

take only the first possibility when the sequence contains X, B or Z wildcards (sets m_takeOnlyFirstWildcard)

Parameters
take_only_first_wildcard: true to take only the first possibility if there are X, B or Z wildcards in the sequence

Definition at line 269 of file enzyme.cpp.

270 {
271  m_takeOnlyFirstWildcard = take_only_first_wildcard;
272 }

References m_takeOnlyFirstWildcard.

Member Data Documentation

◆ m_maxPeptideVariantListSize

std::size_t pappso::Enzyme::m_maxPeptideVariantListSize = 100
private

Definition at line 93 of file enzyme.h.

Referenced by sanityCheck(), and setMaxPeptideVariantListSize().

◆ m_miscleavage

unsigned int pappso::Enzyme::m_miscleavage = 0
private

Definition at line 90 of file enzyme.h.

Referenced by Enzyme(), eat(), getMiscleavage(), and setMiscleavage().

◆ m_recognitionSite

QRegularExpression pappso::Enzyme::m_recognitionSite
private

example with a kinase == [K,R]

Definition at line 89 of file enzyme.h.

Referenced by Enzyme(), eat(), and getQRegExpRecognitionSite().

◆ m_takeOnlyFirstWildcard

bool pappso::Enzyme::m_takeOnlyFirstWildcard = false
private

Definition at line 91 of file enzyme.h.

Referenced by sanityCheck(), and setTakeOnlyFirstWildcard().

◆ m_wildCardB

std::vector<char> pappso::Enzyme::m_wildCardB
private

Definition at line 97 of file enzyme.h.

Referenced by Enzyme(), and replaceWildcards().

◆ m_wildCardX

std::vector<char> pappso::Enzyme::m_wildCardX
private

Definition at line 96 of file enzyme.h.

Referenced by Enzyme(), and replaceWildcards().

◆ m_wildCardZ

std::vector<char> pappso::Enzyme::m_wildCardZ
private

Definition at line 98 of file enzyme.h.

Referenced by Enzyme(), and replaceWildcards().


The documentation for this class was generated from the following files: