libpappsomspp
Library for mass spectrometry
enzyme.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2  * Copyright (c) 2015 Olivier Langella <Olivier.Langella@moulon.inra.fr>.
3  *
4  * This file is part of the PAPPSOms++ library.
5  *
6  * PAPPSOms++ is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * PAPPSOms++ is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with PAPPSOms++. If not, see <http://www.gnu.org/licenses/>.
18  *
19  * Contributors:
20  * Olivier Langella <Olivier.Langella@moulon.inra.fr> - initial API and
21  *implementation
22  ******************************************************************************/
23 
24 #include "enzyme.h"
25 #include <QStringList>
26 #include <QDebug>
27 #include "../exception/exceptionnotpossible.h"
28 //#include <iostream>
29 
30 namespace pappso
31 {
33 {
34  m_recognitionSite.setPattern("([KR])([^P])");
35  m_miscleavage = 0;
36 
37 
38  char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
39  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
40  m_wildCardX.assign(std::begin(vv1), std::end(vv1));
41 
42  char vv2[] = {'N', 'D'};
43  m_wildCardB.assign(std::begin(vv2), std::end(vv2));
44 
45  char vv3[] = {'Q', 'E'};
46  m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
47 }
48 
49 Enzyme::Enzyme(const QString &recognition_site)
50 {
51  m_recognitionSite.setPattern(recognition_site);
52  m_miscleavage = 0;
53 
54 
55  char vv1[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
56  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
57  m_wildCardX.assign(std::begin(vv1), std::end(vv1));
58 
59  char vv2[] = {'N', 'D'};
60  m_wildCardB.assign(std::begin(vv2), std::end(vv2));
61 
62  char vv3[] = {'Q', 'E'};
63  m_wildCardZ.assign(std::begin(vv3), std::end(vv3));
64 }
65 
67 {
68 }
69 
70 void
71 Enzyme::setMiscleavage(unsigned int miscleavage)
72 {
73  m_miscleavage = miscleavage;
74 }
75 unsigned int
77 {
78  return m_miscleavage;
79 }
80 void
81 Enzyme::setMaxPeptideVariantListSize(std::size_t max_peptide_variant_list_size)
82 {
83  m_maxPeptideVariantListSize = max_peptide_variant_list_size;
84 }
85 
86 void
87 Enzyme::eat(std::int8_t sequence_database_id,
88  const ProteinSp &protein_sp,
89  bool is_decoy,
90  EnzymeProductInterface &enzyme_product) const
91 {
92  /*
93  * for aa in self.aa_to_cut:
94  seq = seq.replace(aa, aa + ' ')
95  seq_stack = []
96  for s in seq.strip().split(' '):
97  seq_stack.append(s)
98  if len(seq_stack) > self.misscleavage + 1:
99  seq_stack.pop(0)
100  s2 = ""
101  for s_miss in seq_stack[::-1]:
102  s2 = s_miss + s2
103  yield s2
104  */
105  qDebug() << "Enzyme::eat begin ";
106  const QString sequence = protein_sp.get()->getSequence();
107  qDebug() << sequence;
108  QStringList peptide_list;
109  int pos = 0;
110  int peptide_start = 0;
111  int peptide_size = sequence.size();
112  QRegularExpressionMatch match_recognition_site =
113  m_recognitionSite.match(sequence, pos);
114  while(match_recognition_site.hasMatch())
115  {
116  pos = match_recognition_site.capturedStart(0);
117  peptide_size =
118  pos + match_recognition_site.captured(1).length() - peptide_start;
119  // qDebug() << "pos=" << pos << " peptide_start=" << peptide_start << "
120  // peptide_size=" << peptide_size << " " <<
121  // sequence.mid(peptide_start,peptide_size);
122  if(peptide_size > 0)
123  {
124  peptide_list.append(sequence.mid(peptide_start, peptide_size));
125  }
126  peptide_start += peptide_size;
127  pos = peptide_start; // all peptides MUST be consecutive
128  match_recognition_site = m_recognitionSite.match(sequence, pos);
129  }
130  peptide_size = sequence.size() - peptide_start;
131  if(peptide_size > 0)
132  {
133  peptide_list.append(sequence.mid(peptide_start, peptide_size));
134  }
135 
136  unsigned int start = 1;
137  bool is_nter = true;
138  foreach(const QString &peptide, peptide_list)
139  {
140  // enzyme_product.setPeptide(sequence_database_id, protein_sp,is_decoy,
141  // peptide, start,is_nter,0, false);
142  sanityCheck(enzyme_product,
143  sequence_database_id,
144  protein_sp,
145  is_decoy,
146  peptide,
147  start,
148  is_nter,
149  0,
150  false);
151  is_nter = false;
152  start += peptide.size();
153  }
154 
155  unsigned int miscleavage_i = 0;
156  while(miscleavage_i < m_miscleavage)
157  {
158  miscleavage_i++;
159  qDebug() << "miscleavage_i=" << miscleavage_i;
160  int chunk_number = miscleavage_i + 1;
161  unsigned int start = 1;
162  bool is_nter = true;
163 
164  for(auto i = 0; i < peptide_list.size(); ++i)
165  {
166  qDebug() << "start=" << start;
167  QStringList peptide_mis_list;
168  for(auto j = 0; (j < chunk_number) && ((i + j) < peptide_list.size());
169  j++)
170  {
171  peptide_mis_list << peptide_list.at(i + j);
172  }
173  if(peptide_mis_list.size() == chunk_number)
174  {
175  // enzyme_product.setPeptide(sequence_database_id,
176  // protein_sp,is_decoy, peptide_mis_list.join(""), start,is_nter,
177  // miscleavage_i, false);
178  sanityCheck(enzyme_product,
179  sequence_database_id,
180  protein_sp,
181  is_decoy,
182  peptide_mis_list.join(""),
183  start,
184  is_nter,
185  miscleavage_i,
186  false);
187  }
188  is_nter = false;
189  start += peptide_list.at(i).size();
190  }
191  }
192 }
193 
194 void
195 Enzyme::replaceWildcards(std::vector<std::string> *p_peptide_variant_list) const
196 {
197  std::string new_peptide = p_peptide_variant_list->at(0);
198  qDebug() << "Enzyme::replaceWildcards begin " << new_peptide.c_str();
199  std::vector<std::string> old_peptide_variant_list;
200  old_peptide_variant_list.assign(p_peptide_variant_list->begin(),
201  p_peptide_variant_list->end());
202 
203 
204  for(char wildcard : {'X', 'B', 'Z'})
205  {
206 
207  std::size_t position = new_peptide.find(wildcard);
208  if(position == std::string::npos)
209  {
210  continue;
211  }
212  else
213  {
214  p_peptide_variant_list->clear();
215  /*
216  new_peptide[position] = 'A';
217  p_peptide_variant_list->push_back(new_peptide);
218  break;
219  */
220 
221  const std::vector<char> *p_x_replace_wildcard = nullptr;
222  if(wildcard == 'X')
223  {
224  p_x_replace_wildcard = &m_wildCardX;
225  }
226  else if(wildcard == 'B')
227  {
228  p_x_replace_wildcard = &m_wildCardB;
229  }
230  else if(wildcard == 'Z')
231  {
232  p_x_replace_wildcard = &m_wildCardZ;
233  }
234 
235  if(p_x_replace_wildcard != nullptr)
236  {
237  for(std::string orig_peptide : old_peptide_variant_list)
238  {
239  for(char replace : *p_x_replace_wildcard)
240  {
241  orig_peptide[position] = replace;
242  p_peptide_variant_list->push_back(orig_peptide);
243  }
244  }
245  }
246  else
247  {
248  throw ExceptionNotPossible(
249  QObject::tr("x_replace_wildcard is empty"));
250  }
251  // new_peptide[position] = 'A';
252  // p_peptide_variant_list->push_back(new_peptide);
253  // p_peptide_variant_list->resize(1);
254  // std::cerr << "Enzyme::replaceWildcards begin
255  // p_peptide_variant_list.size()=" << p_peptide_variant_list->size()
256  // <<
257  // endl;
258  break;
259  }
260  }
261  std::vector<std::string>().swap(
262  old_peptide_variant_list); // clear old_peptide_variant_list reallocating
263 
264 
265  qDebug() << "Enzyme::replaceWildcards end " << new_peptide.c_str();
266 }
267 
268 void
269 Enzyme::setTakeOnlyFirstWildcard(bool take_only_first_wildcard)
270 {
271  m_takeOnlyFirstWildcard = take_only_first_wildcard;
272 }
273 
274 
275 void
277  std::int8_t sequence_database_id,
278  const ProteinSp &protein_sp,
279  bool is_decoy,
280  const PeptideStr &peptide,
281  unsigned int start,
282  bool is_nter,
283  unsigned int missed_cleavage_number,
284  bool semi_enzyme) const
285 {
286  if(peptide.contains('X') || peptide.contains('B') || peptide.contains('Z'))
287  {
288 
289  std::vector<std::string> peptide_variant_list;
290  peptide_variant_list.push_back(peptide.toStdString());
291 
292  while((peptide_variant_list.at(0).find('X') != std::string::npos) ||
293  (peptide_variant_list.at(0).find('B') != std::string::npos) ||
294  (peptide_variant_list.at(0).find('Z') != std::string::npos))
295  {
296  replaceWildcards(&peptide_variant_list);
297  if(peptide_variant_list.size() > m_maxPeptideVariantListSize)
298  {
299  peptide_variant_list.resize(m_maxPeptideVariantListSize);
300  peptide_variant_list.shrink_to_fit();
301  }
302  }
303 
304  // peptide_variant_list.resize(2);
306  {
307  enzyme_product.setPeptide(sequence_database_id,
308  protein_sp,
309  is_decoy,
310  QString(peptide_variant_list.at(0).c_str()),
311  start,
312  is_nter,
313  missed_cleavage_number,
314  semi_enzyme);
315  }
316  else
317  {
318  std::string peptide_variant = peptide_variant_list.back();
319  while(peptide_variant_list.size() > 0)
320  {
321  enzyme_product.setPeptide(sequence_database_id,
322  protein_sp,
323  is_decoy,
324  QString(peptide_variant.c_str()),
325  start,
326  is_nter,
327  missed_cleavage_number,
328  semi_enzyme);
329  peptide_variant_list.pop_back();
330  if(peptide_variant_list.size() > 0)
331  {
332  peptide_variant = peptide_variant_list.back();
333  }
334  }
335  }
336  std::vector<std::string>().swap(
337  peptide_variant_list); // clear peptide_variant_list reallocating
338  }
339  else
340  {
341  enzyme_product.setPeptide(sequence_database_id,
342  protein_sp,
343  is_decoy,
344  peptide,
345  start,
346  is_nter,
347  missed_cleavage_number,
348  semi_enzyme);
349  }
350 }
351 
352 const QRegularExpression &
354 {
355  return m_recognitionSite;
356 }
357 } // namespace pappso
virtual void setPeptide(std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme)=0
function to give the products of a protein digestion by an enzyme
QRegularExpression m_recognitionSite
example with a kinase == [K,R]
Definition: enzyme.h:89
std::size_t m_maxPeptideVariantListSize
Definition: enzyme.h:93
unsigned int getMiscleavage() const
get the maximum number of missed cleavage allowed in the digestion
Definition: enzyme.cpp:76
Enzyme()
build the default enzyme (trypsin) with recognition_site = "([KR])([^P])"
Definition: enzyme.cpp:32
void setMiscleavage(unsigned int miscleavage)
sets the maximum number of missed cleavage allowed in the digestion
Definition: enzyme.cpp:71
std::vector< char > m_wildCardB
Definition: enzyme.h:97
std::vector< char > m_wildCardZ
Definition: enzyme.h:98
std::vector< char > m_wildCardX
Definition: enzyme.h:96
void sanityCheck(EnzymeProductInterface &enzyme_product, std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, const PeptideStr &peptide, unsigned int start, bool is_nter, unsigned int missed_cleavage_number, bool semi_enzyme) const
Definition: enzyme.cpp:276
const QRegularExpression & getQRegExpRecognitionSite() const
Definition: enzyme.cpp:353
void replaceWildcards(std::vector< std::string > *p_peptide_variant_list) const
Definition: enzyme.cpp:195
void setTakeOnlyFirstWildcard(bool take_only_first_wildcard)
take only first m_takeOnlyFirstWildcard
Definition: enzyme.cpp:269
void eat(std::int8_t sequence_database_id, const ProteinSp &protein_sp, bool is_decoy, EnzymeProductInterface &enzyme_product) const
digest a protein into enzyme products
Definition: enzyme.cpp:87
unsigned int m_miscleavage
Definition: enzyme.h:90
bool m_takeOnlyFirstWildcard
Definition: enzyme.h:91
void setMaxPeptideVariantListSize(std::size_t max_peptide_variant_list_size)
if there are wildcards in the protein sequence : restrict the number of possible peptide sequences
Definition: enzyme.cpp:81
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition: aa.cpp:39
QString PeptideStr
A type definition for PeptideStr.
Definition: types.h:44
std::shared_ptr< const Protein > ProteinSp
shared pointer on a Protein object
Definition: protein.h:43